CA4 (Machine Learning)

Training different classifiers on a dataset and test models.


University of Tehran
Mobina Mehrazar
810100216

Imports¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from copy import deepcopy
from IPython.display import display, HTML

from sklearn import metrics
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from mlxtend.evaluate import bias_variance_decomp

from dataclasses import dataclass

from xgboost import XGBClassifier
In [2]:
TARGET_COLUMN = 'NumPurchases'

Explorin Dataset¶

Reading a CSV File into a Pandas DataFrame¶

In [3]:
df = pd.read_csv('marketing_campaign.csv')
pd.set_option("display.max_columns", None)
df.head(5)
Out[3]:
Unnamed: 0 ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntCoffee MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebVisitsMonth Complain NumPurchases UsedCampaignOffer
0 0 5524 1957 Graduation Single 58138.0 0 0 04-09-2012 58 635.0 88 546 172 88 88.0 NaN 0 25 1
1 1 2174 1954 Graduation Single 46344.0 1 1 08-03-2014 38 NaN 1 6 2 1 6.0 5.0 0 6 0
2 2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 NaN 49 127 111 21 42.0 NaN 0 21 0
3 3 6182 1984 Graduation Together 26646.0 1 0 10-02-2014 26 11.0 4 20 10 3 5.0 6.0 0 8 0
4 4 5324 1981 PhD Married 58293.0 1 0 19-01-2014 94 173.0 43 118 46 27 15.0 5.0 0 19 0

DataFrame Information¶

The info method shows us the general info about our data frame—for example, the data stored in a dataset and the corresponding data types.

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2240 non-null   int64  
 1   ID                 2240 non-null   int64  
 2   Year_Birth         2240 non-null   int64  
 3   Education          2240 non-null   object 
 4   Marital_Status     2240 non-null   object 
 5   Income             2017 non-null   float64
 6   Kidhome            2240 non-null   int64  
 7   Teenhome           2240 non-null   int64  
 8   Dt_Customer        2240 non-null   object 
 9   Recency            2240 non-null   int64  
 10  MntCoffee          2035 non-null   float64
 11  MntFruits          2240 non-null   int64  
 12  MntMeatProducts    2240 non-null   int64  
 13  MntFishProducts    2240 non-null   int64  
 14  MntSweetProducts   2240 non-null   int64  
 15  MntGoldProds       2227 non-null   float64
 16  NumWebVisitsMonth  2040 non-null   float64
 17  Complain           2240 non-null   int64  
 18  NumPurchases       2240 non-null   int64  
 19  UsedCampaignOffer  2240 non-null   int64  
dtypes: float64(4), int64(13), object(3)
memory usage: 350.1+ KB
In [5]:
df.describe()
Out[5]:
Unnamed: 0 ID Year_Birth Income Kidhome Teenhome Recency MntCoffee MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebVisitsMonth Complain NumPurchases UsedCampaignOffer
count 2240.000000 2240.000000 2240.000000 2017.000000 2240.000000 2240.000000 2240.000000 2035.000000 2240.000000 2240.000000 2240.000000 2240.000000 2227.000000 2040.000000 2240.000000 2240.000000 2240.000000
mean 1119.500000 5592.159821 1968.805804 52297.080317 0.437946 0.506250 49.109375 304.239312 26.302232 166.950000 37.525446 27.062946 43.847777 5.326961 0.009375 14.862054 0.271875
std 646.776623 3246.662198 11.984069 25543.108215 0.563666 0.544538 28.962453 337.515534 39.773434 225.715373 54.628979 41.280498 51.897098 2.439349 0.096391 7.677173 0.445025
min 0.000000 0.000000 1893.000000 2447.000000 -5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 559.750000 2828.250000 1959.000000 35340.000000 0.000000 0.000000 24.000000 23.000000 1.000000 16.000000 3.000000 1.000000 9.000000 3.000000 0.000000 8.000000 0.000000
50% 1119.500000 5458.500000 1970.000000 51369.000000 0.000000 0.000000 49.000000 177.000000 8.000000 67.000000 12.000000 8.000000 24.000000 6.000000 0.000000 15.000000 0.000000
75% 1679.250000 8427.750000 1977.000000 68316.000000 1.000000 1.000000 74.000000 505.000000 33.000000 232.000000 50.000000 33.000000 56.000000 7.000000 0.000000 21.000000 1.000000
max 2239.000000 11191.000000 1996.000000 666666.000000 2.000000 2.000000 99.000000 1493.000000 199.000000 1725.000000 259.000000 263.000000 362.000000 20.000000 1.000000 44.000000 1.000000

Count and Percentage of Missing Data for Each Feature¶

In [6]:
def missing_values(df: pd.DataFrame) -> pd.DataFrame:
    nan_vals_count = df.isna().sum()
    nan_vals_percent = nan_vals_count / len(df)
    nan_values = pd.concat([nan_vals_count, nan_vals_percent], axis=1, keys=["Missing", "Percentage"])
    return nan_values
missing_values(df)
Out[6]:
Missing Percentage
Unnamed: 0 0 0.000000
ID 0 0.000000
Year_Birth 0 0.000000
Education 0 0.000000
Marital_Status 0 0.000000
Income 223 0.099554
Kidhome 0 0.000000
Teenhome 0 0.000000
Dt_Customer 0 0.000000
Recency 0 0.000000
MntCoffee 205 0.091518
MntFruits 0 0.000000
MntMeatProducts 0 0.000000
MntFishProducts 0 0.000000
MntSweetProducts 0 0.000000
MntGoldProds 13 0.005804
NumWebVisitsMonth 200 0.089286
Complain 0 0.000000
NumPurchases 0 0.000000
UsedCampaignOffer 0 0.000000
In [7]:
missing_data = pd.DataFrame({
    'Feature': df.columns,
    'Missing Count': df.isnull().sum(),
    'Missing Percentage': df.isnull().mean() * 100
})

missing_data
Out[7]:
Feature Missing Count Missing Percentage
Unnamed: 0 Unnamed: 0 0 0.000000
ID ID 0 0.000000
Year_Birth Year_Birth 0 0.000000
Education Education 0 0.000000
Marital_Status Marital_Status 0 0.000000
Income Income 223 9.955357
Kidhome Kidhome 0 0.000000
Teenhome Teenhome 0 0.000000
Dt_Customer Dt_Customer 0 0.000000
Recency Recency 0 0.000000
MntCoffee MntCoffee 205 9.151786
MntFruits MntFruits 0 0.000000
MntMeatProducts MntMeatProducts 0 0.000000
MntFishProducts MntFishProducts 0 0.000000
MntSweetProducts MntSweetProducts 0 0.000000
MntGoldProds MntGoldProds 13 0.580357
NumWebVisitsMonth NumWebVisitsMonth 200 8.928571
Complain Complain 0 0.000000
NumPurchases NumPurchases 0 0.000000
UsedCampaignOffer UsedCampaignOffer 0 0.000000

Visualizing Feature Relationships and Correlations¶

Correlation Matrix with a heatmap¶

$\rho_{XY} = \frac{\text{cov}(X, Y)}{\sigma_X \sigma_Y}$

  • $\rho_{XY}$ is the correlation coefficient between variables $X$ and $Y$.
  • $\text{cov}(X, Y)$ represents the covariance between $X$ and $Y$.
  • $\sigma_X$ and $\sigma_Y$ are the standard deviations of $X$ and $Y$ respectively.

$\text{cov}(X, Y) = \frac{\sum_{i=1}^{n}(X_i - \bar{X})(Y_i - \bar{Y})}{n-1}$

  • $\text{cov}(X, Y)$ is the covariance between variables $X$ and $Y$.
  • $X_i$ and $Y_i$ are individual data points for variables $X$ and $Y$.
  • $\bar{X}$ and $\bar{Y}$ are the mean values of $X$ and $Y$ respectively.
  • $n$ is the number of data points.

$\sigma = \sqrt{\frac{\sum_{i=1}^{n}(X_i - \bar{X})^2}{n-1}}$

  • $\sigma$ is the standard deviation.
  • $X_i$ represents individual data points.
  • $\bar{X}$ is the mean of the data.
  • $n$ is the number of data points.
In [8]:
def plot_correlation_heatmap(df):
    numeric_df = df.select_dtypes(include=['number'])
    plt.figure(figsize=(15, 15))
    sns.heatmap(numeric_df.corr(), annot=True, fmt=".3f", cmap="Blues", linewidths=1, square=True)
    plt.title('Correlation Matrix Heatmap')
    plt.xticks(rotation=45, ha='right')
    plt.show()
plot_correlation_heatmap(df)
No description has been provided for this image

Features with stronger correlation to the target column (NumPurchases)¶

In [9]:
def select_features_by_correlation(df, target_column='NumPurchases', threshold_low=0.25, threshold_high=0.45):
    numeric_df = df.select_dtypes(include=['number'])
    target_corr = numeric_df.corr()[target_column].drop(target_column)
    target_corr_filtered = target_corr[abs(target_corr) > threshold_low].sort_values(ascending=False)
    selected_features = target_corr_filtered[abs(target_corr_filtered) > threshold_high].sort_values(ascending=False)
    return selected_features

selected_features = select_features_by_correlation(df, target_column='NumPurchases')
selected_features
Out[9]:
MntCoffee           0.715164
Income              0.562603
MntMeatProducts     0.554229
MntGoldProds        0.493939
MntSweetProducts    0.472876
MntFishProducts     0.469454
MntFruits           0.455461
Name: NumPurchases, dtype: float64

The thresholds are set with default values as:

  • low : 0.25
  • high : 0.45

Number of Observations for Each Unique with a Stronger Correlation.¶

In [10]:
df[selected_features.index].nunique()
Out[10]:
MntCoffee            747
Income              1810
MntMeatProducts      558
MntGoldProds         212
MntSweetProducts     177
MntFishProducts      182
MntFruits            158
dtype: int64
In [11]:
fig_width = 20
fig_height = 6
num_cols = 3
num_rows = (len(selected_features.index) + num_cols - 1) // num_cols

# Create a figure with subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(fig_width, fig_height * num_rows))

axes = axes.flatten()

for idx, feature in enumerate(selected_features.index):
    ax = axes[idx]
    ax.hist(df[feature], edgecolor='white')
    ax.set_ylabel(feature)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel('Value')

for idx in range(len(selected_features.index), len(axes)):
    fig.delaxes(axes[idx])

plt.tight_layout()
plt.show()
No description has been provided for this image

Visualizing Feature Correlation using scatter and hexbin.¶

Here, the scatter and hexbin diagrams are drawn for the chosen features in the previous part.

With a hexbin diagram, we can see the density of similar data.

In [12]:
sns.set(rc={'figure.figsize': (15, 6)})

num_features = len(selected_features.index)
num_cols = 1 
num_rows = (num_features + num_cols - 1) // num_cols
fig, axes = plt.subplots(num_rows, num_cols * 2, figsize=(15, 6 * num_rows))  # Adjust size to fit plots
axes = axes.flatten()

for idx, feature in enumerate(selected_features.index):
    # Scatter plot
    ax_scatter = axes[2 * idx]
    sns.scatterplot(x=feature, y=TARGET_COLUMN, data=df, ax=ax_scatter)
    ax_scatter.set_title(f'Scatter Plot')
    ax_scatter.set_xlabel(feature)
    ax_scatter.set_ylabel(TARGET_COLUMN)
    
    # Hexbin plot
    ax_hexbin = axes[2 * idx + 1]
    hb = ax_hexbin.hexbin(df[feature], df[TARGET_COLUMN], gridsize=50, cmap='Greens')
    cb = fig.colorbar(hb, ax=ax_hexbin)
    cb.set_label('Counts')
    ax_hexbin.set_title(f'Hexbin Plot')
    ax_hexbin.set_xlabel(feature)
    ax_hexbin.set_ylabel(TARGET_COLUMN)

for idx in range(num_features * num_cols * 2, len(axes)):
    fig.delaxes(axes[idx])

plt.tight_layout()
plt.show()
No description has been provided for this image

Other insightful investigations¶

In [13]:
def box_plot(df, selected_features, target_col=TARGET_COLUMN):
    for feature in selected_features.index.to_list():
        sns.boxplot(x=target_col, y=feature, data=df)
        plt.title(f'Box Plot: {feature} vs {target_col}')
        plt.show()
In [14]:
sns.set(rc={'figure.figsize': (14, 6)})
box_plot(df, selected_features)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Preprocessing Dataset¶

Some techniques to solve missing values problem¶

  • Imputation
    All the missing values are replaced by a substitution. A substitution could be:

    • Mean: A simple solution, but outliers can affect it negatively.
    • Median: This is often a more robust choice as it is not affected by outliers.
    • Mode: Preferred for categorical data where mean and median are not defined.
    • Random Fill: Selected values should be between the minimum and maximum in the column.
    • Prediction: Missing values are predicted based on the properties of the rows.
  • Dropping
    We can either drop some columns or rows.

    • Dropping Columns:
      • The entire column is removed, which may lead to a loss of valuable data. This method is more justifiable if a large portion of the column is missing values, as there are fewer reliable values to impute missing data with good precision.
    • Dropping Rows:
      • This is better applied to rows with a majority of missing values to avoid introducing bias or inaccuracies into the dataset. However, this method can result in the loss of potentially important information, so it should be used cautiously.

Handling Missing Values in Dataframe¶

In [15]:
missing_values(df)
Out[15]:
Missing Percentage
Unnamed: 0 0 0.000000
ID 0 0.000000
Year_Birth 0 0.000000
Education 0 0.000000
Marital_Status 0 0.000000
Income 223 0.099554
Kidhome 0 0.000000
Teenhome 0 0.000000
Dt_Customer 0 0.000000
Recency 0 0.000000
MntCoffee 205 0.091518
MntFruits 0 0.000000
MntMeatProducts 0 0.000000
MntFishProducts 0 0.000000
MntSweetProducts 0 0.000000
MntGoldProds 13 0.005804
NumWebVisitsMonth 200 0.089286
Complain 0 0.000000
NumPurchases 0 0.000000
UsedCampaignOffer 0 0.000000
  • Handling Invalid Values
    • First, delete invalid values from columns, such as negative values in countable features like Year_Birth, Income, or Kidhome. This step ensures the integrity of the data, as negative values in these contexts are typically nonsensical or indicative of errors. After removing these values, the remaining data can be further processed using imputation or dropping techniques as needed.
In [16]:
pNegCols = ["UsedCampaignOffer", "NumPurchases", "Complain", "NumWebVisitsMonth", "MntGoldProds",
            "MntSweetProducts", "MntFishProducts", "MntMeatProducts", "MntFruits", "MntCoffee",
            "Recency", "Teenhome", "Kidhome", "Income", "Year_Birth", "ID"]

df[pNegCols] = np.where(df[pNegCols] < 0, np.nan, df[pNegCols])
  • Filling in Missing Values
    • After handling invalid values, use the median to fill in the rest of the missing values. This method is preferred because the median is less affected by outliers and can provide a more accurate representation of the central tendency of the data, especially in the presence of skewed distributions.
In [17]:
def fillna_with_median(df):
    df.fillna(df.median(numeric_only=True), inplace=True)
    return df
def fill_with_mode(df):
    mode_values = df.mode(numeric_only=True).iloc[0]
    df.fillna(mode_values, inplace=True)
In [18]:
fillna_with_median(df)

missing_values(df)
Out[18]:
Missing Percentage
Unnamed: 0 0 0.0
ID 0 0.0
Year_Birth 0 0.0
Education 0 0.0
Marital_Status 0 0.0
Income 0 0.0
Kidhome 0 0.0
Teenhome 0 0.0
Dt_Customer 0 0.0
Recency 0 0.0
MntCoffee 0 0.0
MntFruits 0 0.0
MntMeatProducts 0 0.0
MntFishProducts 0 0.0
MntSweetProducts 0 0.0
MntGoldProds 0 0.0
NumWebVisitsMonth 0 0.0
Complain 0 0.0
NumPurchases 0 0.0
UsedCampaignOffer 0 0.0
  • Advanced Imputation Techniques
    • For rows containing more than a certain number of missing values (e.g., more than two NaN values), consider deleting those rows if they represent a small proportion of the dataset. After removing these rows, apply techniques such as KNNImputer to fill the remaining missing values. KNNImputer uses the nearest neighbors' values to impute missing data, making it a sophisticated method that leverages the relationships between features.

Normalization and Standardization in Numerical Features¶

These techniques aim to bring the values of different features onto a similar scale.

  • Normalization:
    • It scales the values of a feature to a specific range, often between 0 and 1.
    • The normalization formula for a numerical feature (X) is given by: $$ Xnorm = \frac {X-Xmin} {Xmax-Xmin} $$

where $X$ is the original value of the feature, $Xmin$ is the minimum value in the feature, and $Xmax$ is the maximum value in the feature.

  • Normalization is useful when the features have different ranges, and algorithms like neural networks or k-nearest neighbors may perform better when the input features are within a consistent scale.

  • Standardization:

    • transforms the data to have a mean of 0 and a standard deviation of 1. It makes the distribution of each feature more interpretable and facilitates comparisons between different features. $$ Z = \frac {X-Xmin} {Xstd} $$ where $X$ is the original value of the feature, $Xmin$ is the minimum value in the feature, and $Xstd$ is the standard deviation.
    • Standardization is particularly useful when the features have different units or when features follow different distributions. It helps algorithms that rely on the assumption of a normal distribution.
In [19]:
def standardization(df, exclude_cols: list = []):
    numeric_cols = df.select_dtypes(include="number")
    df[numeric_cols.columns] = StandardScaler().fit_transform(numeric_cols)
    df[exclude_cols] = numeric_cols[exclude_cols]
    return df

def normalization(df, exclude_cols: list = []):
    numeric_cols = df.select_dtypes(include="number")
    df[numeric_cols.columns] = MinMaxScaler().fit_transform(numeric_cols)
    df[exclude_cols] = numeric_cols[exclude_cols]
    return df

def plot_histogram(df):
    df.hist(bins=20, figsize=(20,15))
    plt.show()

Normalizing or standardizing is beneficial as the numerical features in this dataset have different scales.

In [20]:
plot_histogram(df)
No description has been provided for this image
In [21]:
df = normalization(df, ['NumPurchases'])

plot_histogram(df)
No description has been provided for this image
In [22]:
df = standardization(df, ['NumPurchases'])
plot_histogram(df)
No description has been provided for this image
In [23]:
df.describe()
Out[23]:
Unnamed: 0 ID Year_Birth Income Kidhome Teenhome Recency MntCoffee MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebVisitsMonth Complain NumPurchases UsedCampaignOffer
count 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2240.000000 2.240000e+03
mean -1.015061e-16 -9.198991e-17 4.452787e-16 -3.552714e-16 9.992007e-17 -3.172066e-18 1.292617e-16 -5.709718e-17 -6.819941e-17 -8.564578e-17 -1.015061e-16 2.537653e-17 9.198991e-17 4.440892e-17 -3.053113e-17 14.862054 3.806479e-17
std 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 7.677173 1.000223e+00
min -1.731278e+00 -1.722818e+00 -6.326960e+00 -2.053225e+00 -8.237017e-01 -9.298944e-01 -1.696001e+00 -9.038860e-01 -6.614492e-01 -7.398135e-01 -6.870680e-01 -6.557331e-01 -8.449660e-01 -2.306859e+00 -9.728167e-02 0.000000 -6.110569e-01
25% -8.656389e-01 -8.514982e-01 -8.184192e-01 -6.304878e-01 -8.237017e-01 -9.298944e-01 -8.671566e-01 -8.204773e-01 -6.363012e-01 -6.689119e-01 -6.321399e-01 -6.315032e-01 -6.710752e-01 -5.939679e-01 -9.728167e-02 8.000000 -6.110569e-01
50% -9.616873e-17 -4.117757e-02 9.967091e-02 -3.448418e-02 -8.237017e-01 -9.298944e-01 -3.777284e-03 -3.570960e-01 -4.602650e-01 -4.429132e-01 -4.673554e-01 -4.618937e-01 -3.812572e-01 2.624776e-01 -9.728167e-02 15.000000 -6.110569e-01
75% 8.656389e-01 8.735813e-01 6.839101e-01 5.852568e-01 1.034397e+00 9.069340e-01 8.596020e-01 5.140609e-01 1.684356e-01 2.882592e-01 2.284015e-01 1.438543e-01 2.370211e-01 6.907003e-01 -9.728167e-02 21.000000 1.636509e+00
max 1.731278e+00 1.724876e+00 2.269702e+00 2.535543e+01 2.892495e+00 2.743762e+00 1.722981e+00 3.708303e+00 4.343008e+00 6.904261e+00 4.055064e+00 5.716737e+00 6.149307e+00 6.257596e+00 1.027943e+01 44.000000 1.636509e+00

Encoding¶

We should encode the categorical features.

  • One-Hot Encoding:

    • Creating a new feature for each category.
    • This method is appropriate when the categories do not have an intrinsic order.
    • It is particularly useful for algorithms that utilize the distance between data points, such as KNN.
    • For example, if we have a feature with 3 categories, we can encode them as follows:
      • Category 1: 1, 0, 0
      • Category 2: 0, 1, 0
      • Category 3: 0, 0, 1
  • Label Encoding:

    • Assigning a numerical label to each category.
    • This method is suitable when the categories have an order.
    • It can be useful for tree-based models that can leverage the ordinal nature of the data.
    • For example, if we have a feature with 3 categories, we can encode them as follows:
      • Category 1: 0
      • Category 2: 1
      • Category 3: 2
  • Binary Encoding:

    • Encoding the categories using binary numbers.
    • This method is suitable when the categories do not have an order.
    • It reduces the dimensionality compared to One-Hot Encoding.
    • For example, if we have a feature with 3 categories, we can encode them as follows:
      • Category 1: 00
      • Category 2: 01
      • Category 3: 10
  • Frequency Encoding:

    • Encoding the categories using the frequency of each category.
    • This method assigns a value to each category based on how often it appears in the dataset.
    • It is useful when the relative frequency of categories carries meaningful information.
    • For example, if we have a feature with 3 categories, we can encode them as follows:
      • Category 1: 0.5
      • Category 2: 0.25
      • Category 3: 0.25
  • Target Encoding:

    • Encoding the categories using the mean of the target variable for each category.
    • This method is useful in predictive modeling, especially when there is a relationship between the category and the target variable.
    • For example, if we have a feature with 3 categories, we can encode them as follows:
      • Category 1: 0.5
      • Category 2: 0.25
      • Category 3: 0.75
In [24]:
def handle_non_numeric_columns_label_encoding(df):
    label_encoder = LabelEncoder()
    for col in df.select_dtypes(exclude=['number']).columns:
        df[col] = label_encoder.fit_transform(df[col])
    return df

df = handle_non_numeric_columns_label_encoding(df)
df.describe()
Out[24]:
Unnamed: 0 ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntCoffee MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebVisitsMonth Complain NumPurchases UsedCampaignOffer
count 2.240000e+03 2.240000e+03 2.240000e+03 2240.000000 2240.000000 2.240000e+03 2.240000e+03 2.240000e+03 2240.000000 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2.240000e+03 2240.000000 2.240000e+03
mean -1.015061e-16 -9.198991e-17 4.452787e-16 2.393750 3.729911 -3.552714e-16 9.992007e-17 -3.172066e-18 327.875446 1.292617e-16 -5.709718e-17 -6.819941e-17 -8.564578e-17 -1.015061e-16 2.537653e-17 9.198991e-17 4.440892e-17 -3.053113e-17 14.862054 3.806479e-17
std 1.000223e+00 1.000223e+00 1.000223e+00 1.124797 1.076277 1.000223e+00 1.000223e+00 1.000223e+00 190.165575 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 1.000223e+00 7.677173 1.000223e+00
min -1.731278e+00 -1.722818e+00 -6.326960e+00 0.000000 0.000000 -2.053225e+00 -8.237017e-01 -9.298944e-01 0.000000 -1.696001e+00 -9.038860e-01 -6.614492e-01 -7.398135e-01 -6.870680e-01 -6.557331e-01 -8.449660e-01 -2.306859e+00 -9.728167e-02 0.000000 -6.110569e-01
25% -8.656389e-01 -8.514982e-01 -8.184192e-01 2.000000 3.000000 -6.304878e-01 -8.237017e-01 -9.298944e-01 163.750000 -8.671566e-01 -8.204773e-01 -6.363012e-01 -6.689119e-01 -6.321399e-01 -6.315032e-01 -6.710752e-01 -5.939679e-01 -9.728167e-02 8.000000 -6.110569e-01
50% -9.616873e-17 -4.117757e-02 9.967091e-02 2.000000 4.000000 -3.448418e-02 -8.237017e-01 -9.298944e-01 326.000000 -3.777284e-03 -3.570960e-01 -4.602650e-01 -4.429132e-01 -4.673554e-01 -4.618937e-01 -3.812572e-01 2.624776e-01 -9.728167e-02 15.000000 -6.110569e-01
75% 8.656389e-01 8.735813e-01 6.839101e-01 3.000000 5.000000 5.852568e-01 1.034397e+00 9.069340e-01 485.000000 8.596020e-01 5.140609e-01 1.684356e-01 2.882592e-01 2.284015e-01 1.438543e-01 2.370211e-01 6.907003e-01 -9.728167e-02 21.000000 1.636509e+00
max 1.731278e+00 1.724876e+00 2.269702e+00 4.000000 7.000000 2.535543e+01 2.892495e+00 2.743762e+00 662.000000 1.722981e+00 3.708303e+00 4.343008e+00 6.904261e+00 4.055064e+00 5.716737e+00 6.149307e+00 6.257596e+00 1.027943e+01 44.000000 1.636509e+00

Feasibility of Column Deletion¶

Columns with low correlation to the target variable can be removed because they likely provide little predictive value.

In [25]:
def remove_low_correlation_columns(df, threshold=0.2):
    correlation_matrix = df.corr()
    correlations_with_target = correlation_matrix['NumPurchases']
    correlation_threshold = 0.2
    low_correlation_features = correlations_with_target[abs(correlations_with_target) < correlation_threshold].index
    df.drop(low_correlation_features, axis=1, inplace=True)

remove_low_correlation_columns(df)
plot_correlation_heatmap(df)
No description has been provided for this image

Splitting the dataset into train and test sets¶

Some common percentages for splitting the dataset into train and test sets:¶

  • train ratio: $90\%$ , test ration: $10\%$
  • train ratio: $80\%$ , test ration: $20\%$
  • train ratio: $70\%$ , test ration: $30\%$
  • train ratio: $60\%$ , test ration: $40\%$

Some ways to split data into training and test sets:¶

  • Randomly split the dataset into train and test sets

    • This method is the most common and involves shuffling the dataset randomly before dividing it into train and test sets.
    • A potential issue with this approach is that the resulting train and test sets may not have the same distribution, mainly if the dataset is not large enough, affecting the model's generalizability.
  • Split the dataset based on time

    • This method is beneficial for time series data, where the order of the data points matters.
    • The training set consists of earlier periods, and the test set consists of later periods.
    • This approach prevents future and past data leakage, which can occur if data is randomly split without considering time order.
  • Split the dataset based on the target

    • This method is helpful in imbalanced datasets, where certain classes are underrepresented.
    • Each class is proportionally represented in both the train and test sets helps the model generalize better across all classes.
    • Techniques such as stratified sampling can be used to maintain the distribution of the target variable in both subsets.

some of the most commonly used library methods for random dataset splitting¶

Method Library
train_test_split scikit-learn
randn numpy
sample pandas
In [26]:
def split_data(df: pd.DataFrame, target_column: str = TARGET_COLUMN, train_percent: float = 0.8, random_state: int = 1):
    data = df[df.columns.difference([target_column])]
    outcome_data = df[target_column]

    data_train, data_test, outcome_train, outcome_test = train_test_split(
        data, outcome_data, train_size=train_percent, random_state=random_state)

    return data_train, data_test, outcome_train, outcome_test

x_train, x_test, y_train, y_test = split_data(df)

The random_state is used as a seed for a random number generator and makes the dataset reproducible.

The used ration is as bellow:

  • train ratio: $80\%$ , test ration: $20\%$

Validation Set¶

A validation set is utilized to assess the performance of a model during training and to prevent overfitting.
This set is crucial for tuning hyperparameters and making adjustments to improve the model’s performance, ensuring that the model does not just perform well on the training data but also generalizes effectively to new, unseen data.

K-Fold Cross Validation¶

It is a technique to partition a dataset into $k$ equally sized folds.
The model is trained and evaluated $k$ times, each using a different fold as the test set and the remaining $k-1$ folds as the training set.
This process helps assess the model's performance across various subsets of the data, ensuring that the model's performance is consistent across different subsets of the data.

1. Partitioning the Data:

  • The dataset is divided into $k$ equally sized folds.
  • Each fold serves as a test set while the remaining $k-1$ folds are used for training.

2. Training and Evaluation:

  • The model is trained on the training set (comprising $k-1$ folds).
  • The trained model is then evaluated on the test set (the remaining fold).

3. Iteration:

  • Steps 1 and 2 are repeated $k$ times, with each of the folds being used exactly once as the test set.

4. Performance Metrics:

  • The performance metrics (e.g., accuracy, precision, recall) from each iteration are averaged to provide a more robust evaluation of the model.

This technique is also commonly used for hyperparameter tuning, aiding in the selection of the optimal set of hyperparameters that generalize well to various data samples.

Linear Regression¶

Main form of simple linear regression function: $$f(x) = \alpha x + \beta$$

here we want to find the bias ($\alpha$) and slope($\beta$) by minimizing the derivation of the Residual Sum of Squares (RSS) function:

  • step 1: Compute RSS of the training data

$$ RSS = \Sigma (y_i - (\hat{\beta} + \hat{\alpha} * x_i) )^2 $$

  • step 2: Compute the derivatives of the RSS function in terms of $\alpha$ and $\beta$, and set them equal to 0 to find the desired parameters

$$ \frac{\partial RSS}{\partial \beta} = \Sigma (-f(x_i) + \hat{\beta} + \hat{\alpha} * x_i) = 0$$ $$ \to \beta = \hat{y} - \hat{\alpha} \hat{x} \to (1)$$

$$ \frac{\partial RSS}{\partial \alpha} = \Sigma (-2 x_i y_i + 2 \hat{\beta} x_i + 2\hat{\alpha} x_i ^ 2) = 0 \to (2)$$

$$ (1) , (2) \to \hat{\alpha} = \frac{\Sigma{(x_i - \hat{x})(y_i - \hat{y})}}{\Sigma{(x_i - \hat{x})^2}} $$ $$ \hat{\beta} = y - \hat{a} x$$

Using the formula provided earlier, the following function is implemented to calculate the parameters of a simple linear regression model.

$\beta$ = $\frac{\sum_{i=1}^{n} (x_iy_i) - \frac{1}{n}\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{\sum_{i=1}^{n}x_i^2 - \frac{1}{n}(\sum_{i=1}^{n}x_i)^2}$

$\alpha$ = $\frac{1}{n}\sum_{i=1}^{n}y_i - \beta \frac{1}{n}\sum_{i=1}^{n}x_i$

In [27]:
selected_features = select_features_by_correlation(df, TARGET_COLUMN, 0.25, 0.477)
display(selected_features)
MntCoffee          0.678082
MntMeatProducts    0.554229
Income             0.535685
MntGoldProds       0.490752
Name: NumPurchases, dtype: float64

Features with a higher correlation to the target column are recommended for use in Linear Regression. This is because features with stronger correlations are typically more predictive, providing a more reliable basis for the model.

In [28]:
selected_features.describe()
Out[28]:
count    4.000000
mean     0.564687
std      0.080157
min      0.490752
25%      0.524452
50%      0.544957
75%      0.585192
max      0.678082
Name: NumPurchases, dtype: float64
In [29]:
def simple_linear_regression(input_feature, output):
    sum_x = np.sum(input_feature)
    sum_y = np.sum(output)

    sum_xy = np.sum(input_feature * output)
    sum_xx = np.sum(input_feature * input_feature)

    slope = (sum_xy - (sum_x * sum_y) / len(input_feature)) / (sum_xx - (sum_x * sum_x) / len(input_feature))
    intercept = sum_y / len(input_feature) - slope * sum_x / len(input_feature)

    return (intercept, slope)

The function bellow predicts values for the given data using the calculated intercept and slope. The prediction is based on the formula:

$\hat{y}$ = $\beta_0$ + $\beta_1 x$

In [30]:
def get_regression_predictions(input_feature, bias, slope):
    predicted_values = bias + slope * input_feature
    return predicted_values

For model evaluation, Root Mean Square Error (RMSE) is used.
RMSE is the square root of the mean of the squared differences between the residuals, and the residuals are just a fancy word for the difference between the predicted output and the true output.

$RMSE$ = $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$

In [31]:
def get_root_mean_square_error(predicted_values, outputs):
    squared_diffs = [(predicted - actual)**2 for predicted, actual in zip(predicted_values, outputs)]
    mean_squared_diff = sum(squared_diffs) / len(squared_diffs)
    rmse = mean_squared_diff**0.5
    return rmse

The RMSE has no bounds. Thus, it becomes challenging to determine whether a particular RMSE value is considered good or bad without any reference point.
Instead, we use the R2 score. The R2 score is calculated by comparing the sum of the squared differences between the actual and predicted values of the dependent variable to the total sum of squared differences between the actual and mean values of the dependent variable.

The R2 score is formulated as below:

$$R^2 = 1 - \frac{SSres}{SStot} = 1 - \frac{\sum_{i=1}^{n} (y_{i,true} - y_{i,pred})^2}{\sum_{i=1}^{n} (y_{i,true} - \bar{y}_{true})^2} $$

In [32]:
def get_r2_score(predicted_values, outputs):
    residuals = outputs - predicted_values
    SSres = np.sum(residuals ** 2)
    SStot = np.sum((outputs - np.mean(outputs)) ** 2)
    R2_score = 1 - SSres / SStot
    return R2_score

Now calculate the fitness of the model.

$\beta$ = $\frac{\sum_{i=1}^{n} (x_iy_i) - \frac{1}{n}\sum_{i=1}^{n}x_i\sum_{i=1}^{n}y_i}{\sum_{i=1}^{n}x_i^2 - \frac{1}{n}(\sum_{i=1}^{n}x_i)^2}$

$\alpha = \frac{1}{n}\sum_{i=1}^{n}y_i - \beta \frac{1}{n}\sum_{i=1}^{n}x_i$

$\hat{y}$ = $4\alpha + \beta x$

$RMSE$ = $\sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$

$R2$ = $1 - \frac{\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}$

In [33]:
def split_data(df: pd.DataFrame, target_column: str = TARGET_COLUMN, train_percent: float = 0.8, random_state: int = 1):
    data = df[df.columns.difference([target_column])]
    outcome_data = df[target_column]

    data_train, data_test, outcome_train, outcome_test = train_test_split(
        data, outcome_data, train_size=train_percent, random_state=random_state)

    return data_train, data_test, outcome_train, outcome_test

x_train, x_test, y_train, y_test = split_data(df)
In [34]:
def log_parameters(feature, RMSE, R2_score, intercept, slope):
    print(f"Feature  :{feature}")
    print(f"RMSE     :{RMSE:0.3f}")
    print(f"R2 Score :{R2_score:0.3f}")
    print(f"y = {intercept:0.3f} * x + {slope:0.3f}")
    print("--------------------------------------------")

def plot_regression_line(x_test, y_test, feature, predicted_values):
    plt.figure(figsize=(6, 4))
    plt.scatter(x_test[feature], y_test, label="Actual Data")
    plt.plot(x_test[feature], predicted_values, color='red', label="Regression Line")
    plt.title(f"Regression Line for {feature}")
    plt.xlabel(feature)
    plt.ylabel(TARGET_COLUMN)
    plt.legend()
    plt.show()
In [35]:
for feature in selected_features.index:
    intercept, slope = simple_linear_regression(x_train[feature], y_train)
    predicted_values = get_regression_predictions(x_test[feature], intercept, slope)
    plot_regression_line(x_test, y_test, feature, predicted_values)
    RMSE = get_root_mean_square_error(predicted_values, y_test)
    R2_score = get_r2_score(predicted_values, y_test)
    log_parameters(feature, RMSE, R2_score, intercept, slope)
No description has been provided for this image
Feature  :MntCoffee
RMSE     :5.502
R2 Score :0.439
y = 14.864 * x + 5.264
--------------------------------------------
No description has been provided for this image
Feature  :MntMeatProducts
RMSE     :6.097
R2 Score :0.311
y = 14.889 * x + 4.238
--------------------------------------------
No description has been provided for this image
Feature  :Income
RMSE     :5.720
R2 Score :0.393
y = 14.864 * x + 3.812
--------------------------------------------
No description has been provided for this image
Feature  :MntGoldProds
RMSE     :6.257
R2 Score :0.274
y = 14.943 * x + 3.759
--------------------------------------------

Multiple Regression¶

Multiple regression is a statistical technique that aims to model the relationship between a dependent variable and two or more independent variables.

Multiple regression with n independent variables is expressed as follows:

$$f(x) = \beta _{0} + \beta_{1} x_{1} + \beta_{2} x_{2} + \beta_{3} x_{3} + \beta_{4} x_{4} + ... + \beta_{n} x_{n} + c $$

To optimize the model for accurate predictions, multiple regression commonly employs iterative algorithms such as gradient descent.

The main goal of the optimization process is to make our predictions as close as possible to the actual values. We measure the prediction error using a cost function, usually denoted as $J(\beta)$.

$$ J(\beta)= \frac {1}{2m} Σ_{i=0}^{m-1}(y_i - (\hat \beta _{0} + \hat \beta_{1} x_{1} + \hat \beta_{2} x_{2} + \hat \beta_{3} x_{3} + \hat \beta_{4} x_{4} + ... + \hat \beta_{n} x_{n}) )^2 $$

Gradient descent iteratively adjusts the coefficients $(\beta_i)$ to minimize the cost function. The update rule for each coefficient is:

$$\beta_{i} = \beta _ {i} - \alpha \frac {∂J(\beta)}{∂\beta_{i}}$$

$$ \frac {∂J(\beta)}{∂\beta_{i}} = \frac {1}{m}Σ_{j=0}^{m-1}(y_j - (\hat \beta _{0} + \hat \beta_{1} x_{j1} + \hat \beta_{2} x_{j2} + \hat \beta_{3} x_{j3} + \hat \beta_{4} x_{j4} + ... + \hat \beta_{n} x_{jn})) x_{ji} $$

In [36]:
def predict_output(feature_matrix, weights, bias):
    predictions = np.dot(feature_matrix, weights)
    predictions += bias
    return predictions

Derivative computation

As we saw, the cost function is the sum over the data points of the squared difference between an observed output and a predicted output.

Since the derivative of a sum is the sum of the derivatives, we can compute the derivative for a single data point and then sum over data points. We can write the squared difference between the observed output and predicted output for a single point as follows:

$$ (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} ))^2 $$

With n feautures and a const , So the derivative will be :

$$ 2 * (output - (const* w _{0} + [feature_1] * w_{1} + ...+ [feature_n] * w_{n} )) $$

The term inside the paranethesis is just the error (difference between prediction and output). So we can re-write this as:

$$2 * error*[feature_i] $$

That is, the derivative for the weight for feature i is the sum (over data points) of 2 times the product of the error and the feature itself. In the case of the constant then this is just twice the sum of the errors!

Recall that twice the sum of the product of two vectors is just twice the dot product of the two vectors. Therefore the derivative for the weight for feature_i is just two times the dot product between the values of feature_i and the current errors.

In [37]:
def feature_derivative(errors, feature):
    derivative = 2 * np.dot(errors, feature)
    return derivative

Gradient Descent

Given a starting point we update the current weights by moving in the negative gradient direction. Recall that the gradient is the direction of increase and therefore the negative gradient is the direction of decrease and we're trying to minimize a cost function.

The amount by which we move in the negative gradient direction is called the step size. We stop when we are sufficiently close to the optimum. We define this by requiring that the magnitude (length) of the gradient vector to be smaller than a fixed tolerance.

In [38]:
def regression_gradient_descent(feature_matrix, outputs, initial_weights, bias, step_size, tolerance):
    weights = np.array(initial_weights)
    converged = False
    while not converged:
        predictions = predict_output(feature_matrix, weights, bias)
        errors = outputs - predictions

        gradient = - feature_derivative(feature_matrix.T, errors)
        weights -= step_size * gradient

        bias_gradient = -2 * np.sum(errors)
        bias -= step_size * bias_gradient

        if np.linalg.norm(gradient) < tolerance:
            converged = True

    return weights, bias
In [39]:
def normalize_features(chosen_features, data_frame):
    for feature in chosen_features:
        data_frame.loc[:, feature] = (data_frame[feature] - data_frame[feature].mean()) / data_frame[feature].std()
    return data_frame


def n_feature_regression(chosen_feature_matrix, target_matrix, keywords):
    initial_weights = keywords['initial_weights']
    step_size = keywords['step_size']
    tolerance = keywords['tolerance']
    bias = keywords['bias']

    weights, bias = regression_gradient_descent(chosen_feature_matrix, target_matrix, initial_weights, bias, step_size,
                                                tolerance)

    return weights, bias

def get_weights_and_bias(chosen_features):
    """
    Computes the weights and bias for a general n feature model.
    :param chosen_features:  list of features to perform multiple regression on
    :return: chosen_feature_matrix, computed weights and bias via regression
    """

    keywords = {
        'initial_weights': np.array([.5]*len(chosen_features)),
        'step_size': 1.e-4,
        'tolerance': 1.e-10,
        'bias': 0
    }

    chosen_feature_dataframe = x_train[chosen_features]
    chosen_feature_matrix = chosen_feature_dataframe.to_numpy()

    target_column = y_train
    target_matrix = target_column.to_numpy()

    train_weights, bias = n_feature_regression(chosen_feature_matrix, target_matrix, keywords)
    return chosen_feature_matrix, train_weights, bias

Two Feature Regression

In this part, we choose two features and implement multiple regression.

In [40]:
chosen_features = selected_features.index[:2]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)
In [41]:
predictions = predict_output(x_test[chosen_features], train_weights, bias)
In [42]:
R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)

print("RMSE     :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE     : 5.374576615233149
R2 Score : 0.4643213845425165
--------------------------------------------

Three Feature Regression

In [43]:
chosen_features = selected_features.index[:3]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)

predictions = predict_output(x_test[chosen_features], train_weights, bias)

R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)

print("RMSE     :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE     : 5.199870341008263
R2 Score : 0.49858095852511164
--------------------------------------------

Five Feature Regression

In [44]:
chosen_features = selected_features.index[:5]
chosen_feature_matrix, train_weights, bias = get_weights_and_bias(chosen_features)

predictions = predict_output(x_test[chosen_features], train_weights, bias)

R2_score = get_r2_score(predictions, y_test)
RMSE = get_root_mean_square_error(predictions, y_test)

print("RMSE     :", RMSE)
print("R2 Score :", R2_score)
print("--------------------------------------------")
RMSE     : 5.0354824739760895
R2 Score : 0.529783385182038
--------------------------------------------

Classification¶

Here, I created a new column named PurchaseRate based on NumPurchases.

In [45]:
df['PurchaseRate'] = np.where(df['NumPurchases'] > df['NumPurchases'].median(), 'HIGH', 'LOW')
median_num_purchases = df['NumPurchases'].median()
df.drop(columns=['NumPurchases'], inplace=True)
TARGET_COLUMN = 'PurchaseRate'
df
Out[45]:
Income Kidhome MntCoffee MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebVisitsMonth UsedCampaignOffer PurchaseRate
0 0.244835 -0.823702 1.057762 1.551577 1.679702 2.462147 1.476500 0.855299 0.262478 1.636509 HIGH
1 -0.241838 1.034397 -0.357096 -0.636301 -0.713225 -0.650449 -0.631503 -0.729039 -0.165745 -0.611057 LOW
2 0.800874 -0.823702 -0.357096 0.570804 -0.177032 1.345274 -0.146905 -0.033476 0.262478 -0.611057 HIGH
3 -1.054666 1.034397 -0.869905 -0.560857 -0.651187 -0.503974 -0.583043 -0.748360 0.262478 -0.611057 LOW
4 0.251231 1.034397 -0.369453 0.419916 -0.216914 0.155164 -0.001525 -0.555148 -0.165745 -0.611057 HIGH
... ... ... ... ... ... ... ... ... ... ... ...
2235 0.372136 -0.823702 1.286363 0.419916 0.066692 0.081926 2.203398 3.927370 -0.165745 -0.611057 HIGH
2236 0.487305 2.892495 0.350333 -0.661449 -0.606873 -0.687068 -0.655733 -0.690396 0.690700 1.636509 HIGH
2237 0.197092 -0.823702 1.901116 0.545656 0.221789 -0.101168 -0.364974 -0.381257 0.262478 1.636509 HIGH
2238 0.703160 -0.823702 0.418295 0.092992 0.208495 0.777683 0.071165 0.333627 0.262478 -0.611057 HIGH
2239 0.027413 1.034397 -0.644392 -0.586005 -0.469501 -0.650449 -0.631503 -0.439221 0.690700 1.636509 LOW

2240 rows × 11 columns

In [46]:
df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1}, inplace=True)
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1709526377.py:1: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1}, inplace=True)
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1709526377.py:1: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df['PurchaseRate'].replace({'LOW': 0, 'HIGH': 1}, inplace=True)
In [47]:
df
Out[47]:
Income Kidhome MntCoffee MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumWebVisitsMonth UsedCampaignOffer PurchaseRate
0 0.244835 -0.823702 1.057762 1.551577 1.679702 2.462147 1.476500 0.855299 0.262478 1.636509 1
1 -0.241838 1.034397 -0.357096 -0.636301 -0.713225 -0.650449 -0.631503 -0.729039 -0.165745 -0.611057 0
2 0.800874 -0.823702 -0.357096 0.570804 -0.177032 1.345274 -0.146905 -0.033476 0.262478 -0.611057 1
3 -1.054666 1.034397 -0.869905 -0.560857 -0.651187 -0.503974 -0.583043 -0.748360 0.262478 -0.611057 0
4 0.251231 1.034397 -0.369453 0.419916 -0.216914 0.155164 -0.001525 -0.555148 -0.165745 -0.611057 1
... ... ... ... ... ... ... ... ... ... ... ...
2235 0.372136 -0.823702 1.286363 0.419916 0.066692 0.081926 2.203398 3.927370 -0.165745 -0.611057 1
2236 0.487305 2.892495 0.350333 -0.661449 -0.606873 -0.687068 -0.655733 -0.690396 0.690700 1.636509 1
2237 0.197092 -0.823702 1.901116 0.545656 0.221789 -0.101168 -0.364974 -0.381257 0.262478 1.636509 1
2238 0.703160 -0.823702 0.418295 0.092992 0.208495 0.777683 0.071165 0.333627 0.262478 -0.611057 1
2239 0.027413 1.034397 -0.644392 -0.586005 -0.469501 -0.650449 -0.631503 -0.439221 0.690700 1.636509 0

2240 rows × 11 columns

splitting data to test and train¶

In [48]:
def split_data(dataframe: pd.DataFrame, outcome: str, train_percent: float = 0.7):
    ddata = dataframe.drop(columns=[outcome])
    odata = dataframe[outcome]

    split_data = train_test_split(ddata, odata, train_size=train_percent, random_state=1)
    dtrain, dtest, otrain, otest = split_data
    
    return dtrain, dtest, otrain, otest
x_train, x_test, y_train, y_test = split_data(df, 'PurchaseRate')
In [49]:
class Classifier:
    def __init__(self, model, x_train, y_train, x_test, y_test, params=None):
        self.model = model
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.params = params if params else {}
        self.set_params()

    def set_params(self):
        self.model_instance = self.model(**self.params)
        self.model_instance.fit(self.x_train, self.y_train)
        self.predictions = self.model_instance.predict(self.x_test)

    def accuracy_test(self) -> float:
        return metrics.accuracy_score(self.y_test, self.predictions)

    def accuracy_train(self) -> float:
        train_predict = self.model_instance.predict(self.x_train)
        return metrics.accuracy_score(self.y_train, train_predict)

    def confusion_matrix(self):
        matrix = metrics.confusion_matrix(self.y_test, self.predictions)
        matrix_disp = metrics.ConfusionMatrixDisplay(matrix)
        matrix_disp.plot(cmap='Blues')
        plt.grid(False)
        plt.title(f'{self.model.__name__} Confusion Matrix')
        plt.show()

    def log_grid_result(self, grid):
        print(f"- Best hyperparameters               : {grid.best_params_}")
        print(f"- Best model's train score (accuracy): {grid.best_score_:0.3f}")

    def grid_search(self, search_params, scoring='accuracy') -> tuple[float, GridSearchCV]:
        grid = GridSearchCV(self.model_instance, search_params, scoring=scoring)
        grid.fit(self.x_train, self.y_train)
        test_score = grid.score(self.x_test, self.y_test)
        print(f"- model's test accuracy              : {self.accuracy_test():0.3f}")
        print(f"- Test Score(accuracy)               : {test_score:0.3f}")
        self.log_grid_result(grid)
        print(f"- Best model's test score            : {test_score:0.3f}")
        # return test_score, grid
    def predict(self, x):
            return self.model.predict(x)

Decision Tree¶

A decision tree is a versatile tool in machine learning, working well for sorting things into groups or guessing values. It chops data into smaller chunks based on different traits, trying to keep things similar within each chunk.

It starts by looking at all the data and picks a trait that splits it into two groups that are as much alike as possible for the thing we care about. It keeps doing this for each smaller group until it hits specific rules, like how detailed the tree can get or how many examples are in each group.

For sorting things, it tries to find traits that give the most useful info or reduce the messiness in the groups. For guessing values, it looks for traits that get our guesses as close as possible to the real answers.

Once the tree's made, it can guess things about new data by following its branches from start to finish. Where it ends tells us our prediction. For sorting things, it might go with the most common group, and for guessing numbers, it could be an average.

These trees are great because they're simple to understand and display visually. They handle different types of data, like categories or numbers. But sometimes, they can get too detailed or struggle if there's noisy or extra stuff in the data. People have ways to fix this, like simplifying the tree or using groups of trees together to make better decisions.

Hyper-parameter Description
max_depth The maximum depth of the decision tree. A larger value of max_depth can capture more complex patterns in the data, but may also lead to overfitting. A smaller value of max_depth may lead to underfitting.
min_samples_split The minimum number of samples required to split an internal node.
min_samples_leaf The minimum number of samples required to be at a leaf node.
criterion The function used to measure the quality of a split. The two options available are gini and entropy.
splitter The strategy used to choose the split at each node. The two options here are random and best.
In [50]:
grid_s_params = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": range(2, 9),
        "min_samples_split": range(2, 9),
        "min_samples_leaf": range(2, 9),
        "random_state": [54],
}

dtree_model = Classifier(DecisionTreeClassifier, x_train, y_train, x_test, y_test)
dtree_model.grid_search(grid_s_params)
- model's test accuracy              : 0.891
- Test Score(accuracy)               : 0.914
- Best hyperparameters               : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 54, 'splitter': 'best'}
- Best model's train score (accuracy): 0.919
- Best model's test score            : 0.914

KNN¶

K-Nearest Neighbors(KNN), is a straightforward and efficient method used for sorting things into categories or guessing values.
It finds the closest K neighbors from the training data to a new item and decides its category or value based on either voting or averaging those neighbors' characteristics. KNN doesn't make any strict assumptions about how the data is spread out.

Choosing the right K value is crucial. If K is large, it can help smooth out any irregularities in the data but might make the model too simple. If K is small, it can capture intricate details but might overcomplicate the model.

Some settings include:

  • algorithm:
    • This specifies how to find those nearest neighbors.
    • The options include auto, ball_tree, kd_tree, and brute.
  • n_neighbors:
    • It's the number of neighbors considered when making a decision.
    • Higher values reduce noise but might oversimplify, while lower values can catch complex details but risk overfitting.
  • metric:
    • It's the measure of distance used to find these neighbors in the data.
In [51]:
grid_s_params = {
        "n_neighbors": range(2,20),
        "metric": ["euclidean", "manhattan", "minkowski"]
}
knear_model = Classifier(KNeighborsClassifier, x_train, y_train, x_test, y_test)
knear_model.grid_search(grid_s_params)
- model's test accuracy              : 0.887
- Test Score(accuracy)               : 0.890
- Best hyperparameters               : {'metric': 'euclidean', 'n_neighbors': 3}
- Best model's train score (accuracy): 0.913
- Best model's test score            : 0.890

Logistic Regression¶

Logistic Regression serves as a supervised learning technique primarily employed for classification tasks. It operates by modeling the likelihood of a binary outcome (like 0 or 1) based on the input features. This model produces a probability score between 0 and 1, representing the chance of the binary outcome occurring. Logistic regression, being parametric, makes certain assumptions about the data's distribution. It's versatile, handling both categorical and numerical data.

The logistic regression equation is:

$$(P(y=1|X)) = \sigma(z) = \frac{1}{1 + e^{-z}}$$

where the z is: $$z = w_1x_1 + w_2x_2 + ... + w_nx_n + b$$ where each $w_i$ is the weight associated with the $i^{th}$ feature.

The model computes optimal coefficients that minimize the difference between predicted probabilities and actual labels in the training data. Predictions involve calculating the probability and applying a decision threshold.

Hyperparameter Description
Solver Chooses the optimization algorithm for coefficient optimization. Common solvers include lbfgs, liblinear, newton-cg, sag, and saga.
Penalty Dictates L1 or L2 regularization to curb overfitting. It picks between the two. L1 can aid feature selection by zeroing some coefficients, while L2 shrinks coefficients towards zero.
C Governs the regularization strength. A lower C means stronger regularization, preventing overfitting but potentially causing underfitting.
In [52]:
grid_s_params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "penalty": ["l2"],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
}
logreg_model = Classifier(LogisticRegression, x_train, y_train, x_test, y_test)
logreg_model.grid_search(grid_s_params)
- model's test accuracy              : 0.881
- Test Score(accuracy)               : 0.879
- Best hyperparameters               : {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
- Best model's train score (accuracy): 0.897
- Best model's test score            : 0.879

Confusion Matrix¶

In [53]:
dtree_model.confusion_matrix()
No description has been provided for this image
In [54]:
knear_model.confusion_matrix()
No description has been provided for this image
In [55]:
logreg_model.confusion_matrix()
No description has been provided for this image

Perform randomized search for each model¶

In [56]:
dt_model = DecisionTreeClassifier(
    max_depth=4, min_samples_split=8, 
    min_samples_leaf=2, random_state=54,
    splitter='best', criterion='gini'
)
dt_model.fit(x_train, y_train)

y_pred_dt = dt_model.predict(x_test)
In [57]:
# K-Nearest Neighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors=9, metric='euclidean')
#  weights='uniform', algorithm='kd_tree')
knn_model.fit(x_train, y_train)
y_pred_knn = knn_model.predict(x_test)
In [58]:
# Logistic Regression Classifier
logreg_model = LogisticRegression(C=1, penalty='l2', solver='newton-cg')
# max_iter=2000, random_state=42)
logreg_model.fit(x_train, y_train)
y_pred_logreg = logreg_model.predict(x_test)
In [59]:
models = {'Decision Tree': (dt_model, y_pred_dt),
          'KNN': (knn_model, y_pred_knn),
          'Logistic Regression': (logreg_model, y_pred_logreg)}
In [60]:
def plot_model_evaluation(models, X_test, y_test):
    fig, axes = plt.subplots(nrows=2, ncols=len(models), figsize=(20, 10))
    plt.subplots_adjust(hspace=0.5)

    for i, (model_name, (model, y_pred)) in enumerate(models.items()):
        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        labels = ['Low', 'High']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0, i], xticklabels=labels, yticklabels=labels)
        axes[0, i].set_title(f'Confusion Matrix - {model_name}')

        # Classification Report
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        sns.heatmap(pd.DataFrame(classification_rep).iloc[:-1, :].T, annot=True, cmap='Blues', ax=axes[1, i])
        axes[1, i].set_title(f'Classification Report - {model_name}')

        # Accuracy
        accuracy = accuracy_score(y_test, y_pred)
        axes[1, i].text(0.5, -0.2, f'Accuracy: {accuracy:.2%}', horizontalalignment='center',
                        verticalalignment='center', transform=axes[1, i].transAxes)

    plt.show()
In [61]:
plot_model_evaluation(models, x_test, y_test)
No description has been provided for this image
In [62]:
def hyper_param_comp(param_range, init_params, param):
    test_accs = []
    train_accs = []
    for i in param_range:
        init_params[param] = i
        classifier = RandomForestClassifier(criterion=init_params['criterion'], max_depth=init_params['max_depth'], n_estimators=init_params['n_estimators'])
        RF = Classifier(classifier, grid_search_params, X_train, y_train)
        test_accs.append(RF.calc_accuracy(X_test, y_test))
        train_accs.append(RF.train_accuracy)
        
    plt.plot(param_range, test_accs, color='blue', label='test')
    plt.plot(param_range, train_accs, color='red', label='train')
    plt.xlabel(param)
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")

GridSearchCV¶

GridSearchCV is a technique used in machine learning to fine-tune hyperparameters effectively, aiming to find the best combination for a model. It involves specifying various hyperparameters and testing different combinations to determine the optimal set.

Key Parameter Description
estimator Represents the model under consideration for parameter tuning.
param_grid A dictionary or list of dictionaries that outlines the hyperparameters and their potential values to explore.
cv Denotes the cross-validation strategy, determining how the dataset is split into training and validation sets.
scoring Evaluates and scores the model's performance against the validation set.
n_jobs Specifies the number of CPU cores used for parallel processing. Using n_jobs=-1 utilizes all available CPU cores for faster computation.
verbose Controls the level of detail in the output during the search process. Setting verbose=1 displays progress messages during the hyperparameter search.
In [63]:
dtree_grid_s_params = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": range(2, 9),
        "min_samples_split": range(2, 9),
        "min_samples_leaf": range(2, 9),
        "random_state": [54],
}
print(f'Decision Tree')
dtree_model.grid_search(dtree_grid_s_params)

knear_grid_s_params = {
        "n_neighbors": range(2,20),
        "metric": ["euclidean", "manhattan", "minkowski"]
}
print(f'K Nearest Neighbors')
grid_res = knear_model.grid_search(knear_grid_s_params)

# print(f'Logistic reg model')
# grid_s_params = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     "penalty": ["l2"],
#     "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
# }
# grid_res = logreg_model.grid_search()
Decision Tree
- model's test accuracy              : 0.891
- Test Score(accuracy)               : 0.914
- Best hyperparameters               : {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'random_state': 54, 'splitter': 'best'}
- Best model's train score (accuracy): 0.919
- Best model's test score            : 0.914
K Nearest Neighbors
- model's test accuracy              : 0.887
- Test Score(accuracy)               : 0.890
- Best hyperparameters               : {'metric': 'euclidean', 'n_neighbors': 3}
- Best model's train score (accuracy): 0.913
- Best model's test score            : 0.890
In [64]:
dtree_params = {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 8, 'min_samples_split': 2, 'random_state': 1}
new_dtree_classifier = Classifier(DecisionTreeClassifier, x_train, y_train, x_test, y_test, dtree_params)
print(f'Decision Tree test:       {new_dtree_classifier.accuracy_test() * 100:.3f}%')
print(f'Decision Tree train:       {new_dtree_classifier.accuracy_train() * 100:.3f}%')
Decision Tree test:       91.220%
Decision Tree train:       92.921%
In [65]:
knear_params = {'n_neighbors': 9}
new_knear_classifier = Classifier(KNeighborsClassifier, x_train, y_train, x_test, y_test, knear_params)
print(f'knn test:       {new_knear_classifier.accuracy_test() * 100:.3f}%')
print(f'knn train:       {new_knear_classifier.accuracy_train() * 100:.3f}%')
knn test:       88.542%
knn train:       92.283%

Underfitting and Overfitting¶

Some key consepts:

  • Bias
    • Bias refers to how well a model captures the intricacies of the training data.
      • Higher bias indicates less precision in capturing complex patterns, as it might not consider all features, including noisy data.
      • Lower bias suggests better accuracy on the training data, but it might lead to overfitting if the model starts capturing noise.
    • Ideally, you want to balance bias to prevent both underfitting and overfitting.
  • Variance
    • Variance measures the model's performance on new, unseen data.
      • Lower variance indicates that the model generalizes well and accurately identifies patterns in the testing dataset.
      • High variance, however, implies that the model may be overfitting, capturing noise in the training data rather than general patterns.
  • Trade-off Between Bias and Variance
    • Achieving both low bias and low variance simultaneously is challenging.
    • Reducing one typically increases the other. It's crucial to find a balance to optimize model performance.

Overfitting

  • Overfitting occurs when a model becomes overly complex, fitting the training data too closely.
  • It captures noise instead of the actual patterns, leading to poor performance on new, unseen data.
  • Factors contributing to overfitting include too many features, excessive model complexity, or prolonged training.
  • To detect overfitting, compare performance on training and validation data.
  • A model that excels on training data but performs poorly on validation data is likely overfitting.
Technique Description
Regularization Adds a penalty to the loss function to discourage overly large weights.
Early Stopping Halts training when the model's performance on validation data plateaus.
Model Simplification Reduces complexity to prevent overfitting.

Underfitting

  • Underfitting happens when a model is too simplistic to capture underlying patterns in the data.
  • It results in poor performance on both training and new data.
  • Causes of underfitting include too few features, excessive simplicity, or insufficient training.
  • To identify underfitting, assess the model's performance on both training and validation data.
  • A model struggling with both is likely underfitting.
Technique Description
Add Features Includes more relevant features to capture data patterns.
Increase Model Complexity Uses more complex models that better capture the data.
Extend Training Duration Allows more time for the model to learn from the data.

Finding the right balance between model complexity and available data is crucial. A more intricate model may require more data to avoid overfitting.

In [66]:
print(f'Decision Tree:       {dtree_model.accuracy_train() * 100:.3f}%')
print(f'K Nearest Neighbors: {knear_model.accuracy_train() * 100:.3f}%')
Decision Tree:       100.000%
K Nearest Neighbors: 93.495%

Results

As we can see, the result on train data is a little better than the result in test data but in overal they are st the same reange.

Model Accuracy
Decision Tree 90.923%
K Nearest Neighbors 88.095%
Logistic Regression 87.649%

The result changes after changing the threshold for removing columns with low correlations, but the total change is insignificant.

In [67]:
plot_tree(dtree_model.model_instance, filled=True, feature_names=x_train.columns, class_names=['Low', 'High'])
Out[67]:
[Text(0.24743238369193502, 0.9772727272727273, 'MntMeatProducts <= -0.458\ngini = 0.5\nsamples = 1568\nvalue = [800, 768]\nclass = Low'),
 Text(0.12083214591051582, 0.9318181818181818, 'MntCoffee <= 0.057\ngini = 0.176\nsamples = 769\nvalue = [694, 75]\nclass = Low'),
 Text(0.11171273867198632, 0.8863636363636364, 'MntGoldProds <= 0.111\ngini = 0.079\nsamples = 724\nvalue = [694, 30]\nclass = Low'),
 Text(0.05927614705044172, 0.8409090909090909, 'MntSweetProducts <= 0.943\ngini = 0.04\nsamples = 685\nvalue = [671, 14]\nclass = Low'),
 Text(0.050156739811912224, 0.7954545454545454, 'MntCoffee <= -0.334\ngini = 0.035\nsamples = 683\nvalue = [671, 12]\nclass = Low'),
 Text(0.018238814477058992, 0.75, 'MntCoffee <= -0.414\ngini = 0.015\nsamples = 654\nvalue = [649, 5]\nclass = Low'),
 Text(0.009119407238529496, 0.7045454545454546, 'gini = 0.0\nsamples = 580\nvalue = [580, 0]\nclass = Low'),
 Text(0.027358221715588486, 0.7045454545454546, 'Income <= 0.444\ngini = 0.126\nsamples = 74\nvalue = [69, 5]\nclass = Low'),
 Text(0.018238814477058992, 0.6590909090909091, 'MntMeatProducts <= -0.569\ngini = 0.104\nsamples = 73\nvalue = [69, 4]\nclass = Low'),
 Text(0.009119407238529496, 0.6136363636363636, 'gini = 0.0\nsamples = 58\nvalue = [58, 0]\nclass = Low'),
 Text(0.027358221715588486, 0.6136363636363636, 'MntMeatProducts <= -0.529\ngini = 0.391\nsamples = 15\nvalue = [11, 4]\nclass = Low'),
 Text(0.018238814477058992, 0.5681818181818182, 'MntCoffee <= -0.394\ngini = 0.5\nsamples = 8\nvalue = [4, 4]\nclass = Low'),
 Text(0.009119407238529496, 0.5227272727272727, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'),
 Text(0.027358221715588486, 0.5227272727272727, 'MntFishProducts <= -0.66\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = Low'),
 Text(0.018238814477058992, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.036477628954117984, 0.4772727272727273, 'MntCoffee <= -0.379\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'),
 Text(0.027358221715588486, 0.4318181818181818, 'MntMeatProducts <= -0.547\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.018238814477058992, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.036477628954117984, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.045597036192647475, 0.4318181818181818, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'),
 Text(0.036477628954117984, 0.5681818181818182, 'gini = 0.0\nsamples = 7\nvalue = [7, 0]\nclass = Low'),
 Text(0.036477628954117984, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.08207466514676547, 0.75, 'Kidhome <= 0.105\ngini = 0.366\nsamples = 29\nvalue = [22, 7]\nclass = Low'),
 Text(0.06383585066970647, 0.7045454545454546, 'MntSweetProducts <= -0.353\ngini = 0.111\nsamples = 17\nvalue = [16, 1]\nclass = Low'),
 Text(0.05471644343117697, 0.6590909090909091, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = Low'),
 Text(0.07295525790823597, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.10031347962382445, 0.7045454545454546, 'MntSweetProducts <= -0.559\ngini = 0.5\nsamples = 12\nvalue = [6, 6]\nclass = Low'),
 Text(0.09119407238529495, 0.6590909090909091, 'MntMeatProducts <= -0.656\ngini = 0.375\nsamples = 8\nvalue = [2, 6]\nclass = High'),
 Text(0.08207466514676547, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.10031347962382445, 0.6136363636363636, 'MntCoffee <= -0.274\ngini = 0.245\nsamples = 7\nvalue = [1, 6]\nclass = High'),
 Text(0.09119407238529495, 0.5681818181818182, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.10943288686235395, 0.5681818181818182, 'MntCoffee <= -0.249\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = High'),
 Text(0.10031347962382445, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.11855229410088344, 0.5227272727272727, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.10943288686235395, 0.6590909090909091, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'),
 Text(0.06839555428897122, 0.7954545454545454, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'),
 Text(0.16414933029353093, 0.8409090909090909, 'MntMeatProducts <= -0.498\ngini = 0.484\nsamples = 39\nvalue = [23, 16]\nclass = Low'),
 Text(0.15502992305500143, 0.7954545454545454, 'Income <= -0.104\ngini = 0.451\nsamples = 35\nvalue = [23, 12]\nclass = Low'),
 Text(0.12767170133941294, 0.75, 'NumWebVisitsMonth <= -1.45\ngini = 0.255\nsamples = 20\nvalue = [17, 3]\nclass = Low'),
 Text(0.11855229410088344, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.13679110857794244, 0.7045454545454546, 'MntGoldProds <= 0.14\ngini = 0.188\nsamples = 19\nvalue = [17, 2]\nclass = Low'),
 Text(0.12767170133941294, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.14591051581647194, 0.6590909090909091, 'MntFishProducts <= -0.019\ngini = 0.105\nsamples = 18\nvalue = [17, 1]\nclass = Low'),
 Text(0.13679110857794244, 0.6136363636363636, 'gini = 0.0\nsamples = 16\nvalue = [16, 0]\nclass = Low'),
 Text(0.15502992305500143, 0.6136363636363636, 'MntGoldProds <= 1.232\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.14591051581647194, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.16414933029353093, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.1823881447705899, 0.75, 'MntFruits <= -0.184\ngini = 0.48\nsamples = 15\nvalue = [6, 9]\nclass = High'),
 Text(0.17326873753206043, 0.7045454545454546, 'MntFishProducts <= -0.586\ngini = 0.496\nsamples = 11\nvalue = [6, 5]\nclass = Low'),
 Text(0.16414933029353093, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.1823881447705899, 0.6590909090909091, 'Income <= -0.052\ngini = 0.375\nsamples = 8\nvalue = [6, 2]\nclass = Low'),
 Text(0.17326873753206043, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'),
 Text(0.1915075520091194, 0.6136363636363636, 'gini = 0.0\nsamples = 6\nvalue = [6, 0]\nclass = Low'),
 Text(0.1915075520091194, 0.7045454545454546, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.17326873753206043, 0.7954545454545454, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.1299515531490453, 0.8863636363636364, 'gini = 0.0\nsamples = 45\nvalue = [0, 45]\nclass = High'),
 Text(0.37403262147335425, 0.9318181818181818, 'MntCoffee <= -0.539\ngini = 0.23\nsamples = 799\nvalue = [106.0, 693.0]\nclass = High'),
 Text(0.24622399544029638, 0.8863636363636364, 'MntGoldProds <= -0.797\ngini = 0.452\nsamples = 55\nvalue = [36, 19]\nclass = Low'),
 Text(0.23710458820176689, 0.8409090909090909, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'),
 Text(0.2553434026788259, 0.8409090909090909, 'MntSweetProducts <= 1.222\ngini = 0.403\nsamples = 50\nvalue = [36.0, 14.0]\nclass = Low'),
 Text(0.24622399544029638, 0.7954545454545454, 'Kidhome <= 1.963\ngini = 0.34\nsamples = 46\nvalue = [36, 10]\nclass = Low'),
 Text(0.23710458820176689, 0.75, 'MntGoldProds <= 2.362\ngini = 0.298\nsamples = 44\nvalue = [36, 8]\nclass = Low'),
 Text(0.2279851809632374, 0.7045454545454546, 'MntMeatProducts <= 3.731\ngini = 0.273\nsamples = 43\nvalue = [36.0, 7.0]\nclass = Low'),
 Text(0.2188657737247079, 0.6590909090909091, 'MntFruits <= -0.649\ngini = 0.245\nsamples = 42\nvalue = [36, 6]\nclass = Low'),
 Text(0.2097463664861784, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.2279851809632374, 0.6136363636363636, 'Income <= -0.082\ngini = 0.214\nsamples = 41\nvalue = [36, 5]\nclass = Low'),
 Text(0.2188657737247079, 0.5681818181818182, 'Income <= -0.235\ngini = 0.33\nsamples = 24\nvalue = [19, 5]\nclass = Low'),
 Text(0.2006269592476489, 0.5227272727272727, 'MntFruits <= 0.194\ngini = 0.1\nsamples = 19\nvalue = [18, 1]\nclass = Low'),
 Text(0.1915075520091194, 0.4772727272727273, 'gini = 0.0\nsamples = 18\nvalue = [18, 0]\nclass = Low'),
 Text(0.2097463664861784, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.23710458820176689, 0.5227272727272727, 'MntGoldProds <= 0.334\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = High'),
 Text(0.2279851809632374, 0.4772727272727273, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.24622399544029638, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.23710458820176689, 0.5681818181818182, 'gini = 0.0\nsamples = 17\nvalue = [17, 0]\nclass = Low'),
 Text(0.23710458820176689, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.24622399544029638, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.2553434026788259, 0.75, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'),
 Text(0.2644628099173554, 0.7954545454545454, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.5018412475064121, 0.8863636363636364, 'MntMeatProducts <= -0.379\ngini = 0.17\nsamples = 744\nvalue = [70, 674]\nclass = High'),
 Text(0.3328583642063266, 0.8409090909090909, 'MntCoffee <= -0.223\ngini = 0.398\nsamples = 62\nvalue = [17, 45]\nclass = High'),
 Text(0.2918210316329439, 0.7954545454545454, 'Income <= 0.18\ngini = 0.49\nsamples = 21\nvalue = [12, 9]\nclass = Low'),
 Text(0.2735822171558849, 0.75, 'MntGoldProds <= 0.092\ngini = 0.391\nsamples = 15\nvalue = [11, 4]\nclass = Low'),
 Text(0.2644628099173554, 0.7045454545454546, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = Low'),
 Text(0.2827016243944144, 0.7045454545454546, 'NumWebVisitsMonth <= 0.048\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'),
 Text(0.2735822171558849, 0.6590909090909091, 'MntFruits <= 0.168\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'),
 Text(0.2644628099173554, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.2827016243944144, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.2918210316329439, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.31005984611000287, 0.75, 'MntMeatProducts <= -0.392\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'),
 Text(0.30094043887147337, 0.7045454545454546, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'),
 Text(0.31917925334853237, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.3738956967797093, 0.7954545454545454, 'MntMeatProducts <= -0.401\ngini = 0.214\nsamples = 41\nvalue = [5, 36]\nclass = High'),
 Text(0.34653747506412086, 0.75, 'MntCoffee <= 1.135\ngini = 0.157\nsamples = 35\nvalue = [3, 32]\nclass = High'),
 Text(0.33741806782559136, 0.7045454545454546, 'gini = 0.0\nsamples = 24\nvalue = [0, 24]\nclass = High'),
 Text(0.3556568823026503, 0.7045454545454546, 'MntCoffee <= 1.495\ngini = 0.397\nsamples = 11\nvalue = [3, 8]\nclass = High'),
 Text(0.34653747506412086, 0.6590909090909091, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'),
 Text(0.3647762895411798, 0.6590909090909091, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'),
 Text(0.4012539184952978, 0.75, 'NumWebVisitsMonth <= 0.477\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'),
 Text(0.3921345112567683, 0.7045454545454546, 'MntSweetProducts <= -0.377\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'),
 Text(0.3830151040182388, 0.6590909090909091, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.4012539184952978, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.4103733257338273, 0.7045454545454546, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.6708241308064976, 0.8409090909090909, 'NumWebVisitsMonth <= -1.665\ngini = 0.143\nsamples = 682\nvalue = [53, 629]\nclass = High'),
 Text(0.4901681390709604, 0.7954545454545454, 'Income <= 0.553\ngini = 0.329\nsamples = 101\nvalue = [21, 80]\nclass = High'),
 Text(0.4468509546879453, 0.75, 'MntMeatProducts <= 0.902\ngini = 0.499\nsamples = 19\nvalue = [10, 9]\nclass = Low'),
 Text(0.4286121402108863, 0.7045454545454546, 'MntGoldProds <= 2.72\ngini = 0.18\nsamples = 10\nvalue = [9, 1]\nclass = Low'),
 Text(0.4194927329723568, 0.6590909090909091, 'gini = 0.0\nsamples = 9\nvalue = [9, 0]\nclass = Low'),
 Text(0.4377315474494158, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.4650897691650043, 0.7045454545454546, 'MntCoffee <= -0.328\ngini = 0.198\nsamples = 9\nvalue = [1, 8]\nclass = High'),
 Text(0.4559703619264748, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.47420917640353377, 0.6590909090909091, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'),
 Text(0.5334853234539755, 0.75, 'MntCoffee <= 0.736\ngini = 0.232\nsamples = 82\nvalue = [11, 71]\nclass = High'),
 Text(0.5015673981191222, 0.7045454545454546, 'MntMeatProducts <= 1.79\ngini = 0.081\nsamples = 47\nvalue = [2, 45]\nclass = High'),
 Text(0.49244799088059277, 0.6590909090909091, 'gini = 0.0\nsamples = 42\nvalue = [0, 42]\nclass = High'),
 Text(0.5106868053576518, 0.6590909090909091, 'MntMeatProducts <= 2.054\ngini = 0.48\nsamples = 5\nvalue = [2, 3]\nclass = High'),
 Text(0.5015673981191222, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.5198062125961812, 0.6136363636363636, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.5654032487888287, 0.7045454545454546, 'MntMeatProducts <= 0.601\ngini = 0.382\nsamples = 35\nvalue = [9, 26]\nclass = High'),
 Text(0.5471644343117698, 0.6590909090909091, 'MntCoffee <= 1.449\ngini = 0.444\nsamples = 6\nvalue = [4, 2]\nclass = Low'),
 Text(0.5380450270732402, 0.6136363636363636, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'),
 Text(0.5562838415502992, 0.6136363636363636, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'),
 Text(0.5836420632658877, 0.6590909090909091, 'MntMeatProducts <= 1.241\ngini = 0.285\nsamples = 29\nvalue = [5, 24]\nclass = High'),
 Text(0.5745226560273582, 0.6136363636363636, 'gini = 0.0\nsamples = 14\nvalue = [0, 14]\nclass = High'),
 Text(0.5927614705044172, 0.6136363636363636, 'MntFishProducts <= 1.171\ngini = 0.444\nsamples = 15\nvalue = [5, 10]\nclass = High'),
 Text(0.5745226560273582, 0.5681818181818182, 'MntCoffee <= 0.88\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = High'),
 Text(0.5654032487888287, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.5836420632658877, 0.5227272727272727, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = High'),
 Text(0.6110002849814762, 0.5681818181818182, 'MntGoldProds <= -0.159\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'),
 Text(0.6018808777429467, 0.5227272727272727, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'),
 Text(0.6201196922200057, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.8514801225420348, 0.7954545454545454, 'Income <= -0.887\ngini = 0.104\nsamples = 581\nvalue = [32, 549]\nclass = High'),
 Text(0.8024722143060701, 0.75, 'MntSweetProducts <= -0.474\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.7933528070675406, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.8115916215445996, 0.7045454545454546, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.9004880307779994, 0.75, 'MntFruits <= 4.205\ngini = 0.101\nsamples = 579\nvalue = [31, 548]\nclass = High'),
 Text(0.8298304360216586, 0.7045454545454546, 'MntSweetProducts <= -0.232\ngini = 0.099\nsamples = 577\nvalue = [30, 547]\nclass = High'),
 Text(0.7158734682245654, 0.6590909090909091, 'MntFruits <= 3.413\ngini = 0.159\nsamples = 172\nvalue = [15, 157]\nclass = High'),
 Text(0.7067540609860359, 0.6136363636363636, 'MntCoffee <= 3.385\ngini = 0.15\nsamples = 171\nvalue = [14, 157]\nclass = High'),
 Text(0.6976346537475064, 0.5681818181818182, 'MntCoffee <= -0.269\ngini = 0.141\nsamples = 170\nvalue = [13.0, 157.0]\nclass = High'),
 Text(0.6383585066970647, 0.5227272727272727, 'NumWebVisitsMonth <= -0.594\ngini = 0.358\nsamples = 30\nvalue = [7, 23]\nclass = High'),
 Text(0.6292390994585352, 0.4772727272727273, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.6474779139355942, 0.4772727272727273, 'MntMeatProducts <= -0.248\ngini = 0.293\nsamples = 28\nvalue = [5, 23]\nclass = High'),
 Text(0.6383585066970647, 0.4318181818181818, 'MntCoffee <= -0.366\ngini = 0.496\nsamples = 11\nvalue = [5, 6]\nclass = High'),
 Text(0.6201196922200057, 0.38636363636363635, 'MntMeatProducts <= -0.275\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'),
 Text(0.6110002849814762, 0.3409090909090909, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.6292390994585352, 0.3409090909090909, 'MntSweetProducts <= -0.535\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.6201196922200057, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.6383585066970647, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.6565973211741237, 0.38636363636363635, 'MntGoldProds <= 1.029\ngini = 0.32\nsamples = 5\nvalue = [4, 1]\nclass = Low'),
 Text(0.6474779139355942, 0.3409090909090909, 'gini = 0.0\nsamples = 4\nvalue = [4, 0]\nclass = Low'),
 Text(0.6657167284126532, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.6565973211741237, 0.4318181818181818, 'gini = 0.0\nsamples = 17\nvalue = [0, 17]\nclass = High'),
 Text(0.7569108007979481, 0.5227272727272727, 'MntFruits <= 1.149\ngini = 0.082\nsamples = 140\nvalue = [6, 134]\nclass = High'),
 Text(0.7204331718438302, 0.4772727272727273, 'Income <= 0.845\ngini = 0.046\nsamples = 127\nvalue = [3, 124]\nclass = High'),
 Text(0.7021943573667712, 0.4318181818181818, 'MntGoldProds <= -0.671\ngini = 0.018\nsamples = 112\nvalue = [1, 111]\nclass = High'),
 Text(0.6930749501282417, 0.38636363636363635, 'NumWebVisitsMonth <= -1.236\ngini = 0.278\nsamples = 6\nvalue = [1, 5]\nclass = High'),
 Text(0.6839555428897122, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.7021943573667712, 0.3409090909090909, 'gini = 0.0\nsamples = 5\nvalue = [0, 5]\nclass = High'),
 Text(0.7113137646053006, 0.38636363636363635, 'gini = 0.0\nsamples = 106\nvalue = [0, 106]\nclass = High'),
 Text(0.7386719863208892, 0.4318181818181818, 'Income <= 0.884\ngini = 0.231\nsamples = 15\nvalue = [2, 13]\nclass = High'),
 Text(0.7295525790823596, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.7477913935594186, 0.38636363636363635, 'MntMeatProducts <= -0.159\ngini = 0.133\nsamples = 14\nvalue = [1, 13]\nclass = High'),
 Text(0.7386719863208892, 0.3409090909090909, 'MntFishProducts <= -0.238\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.7295525790823596, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.7477913935594186, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.7569108007979481, 0.3409090909090909, 'gini = 0.0\nsamples = 12\nvalue = [0, 12]\nclass = High'),
 Text(0.7933884297520661, 0.4772727272727273, 'MntMeatProducts <= 1.613\ngini = 0.355\nsamples = 13\nvalue = [3, 10]\nclass = High'),
 Text(0.7751496152750071, 0.4318181818181818, 'MntFruits <= 1.262\ngini = 0.18\nsamples = 10\nvalue = [1, 9]\nclass = High'),
 Text(0.7660302080364776, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.7842690225135366, 0.38636363636363635, 'gini = 0.0\nsamples = 9\nvalue = [0, 9]\nclass = High'),
 Text(0.8116272442291251, 0.4318181818181818, 'MntFruits <= 2.696\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'),
 Text(0.8025078369905956, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.8207466514676546, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.7158734682245654, 0.5681818181818182, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.7249928754630949, 0.6136363636363636, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.9437874038187518, 0.6590909090909091, 'MntSweetProducts <= 3.415\ngini = 0.071\nsamples = 405\nvalue = [15, 390]\nclass = High'),
 Text(0.9058136221145625, 0.6136363636363636, 'MntFruits <= 1.111\ngini = 0.06\nsamples = 388\nvalue = [12, 376]\nclass = High'),
 Text(0.8966942148760331, 0.5681818181818182, 'MntFruits <= 1.086\ngini = 0.087\nsamples = 262\nvalue = [12, 250]\nclass = High'),
 Text(0.8875748076375035, 0.5227272727272727, 'MntCoffee <= -0.388\ngini = 0.081\nsamples = 261\nvalue = [11, 250]\nclass = High'),
 Text(0.8389854659447136, 0.4772727272727273, 'Income <= 0.256\ngini = 0.32\nsamples = 10\nvalue = [2, 8]\nclass = High'),
 Text(0.8298660587061841, 0.4318181818181818, 'gini = 0.0\nsamples = 7\nvalue = [0, 7]\nclass = High'),
 Text(0.8481048731832431, 0.4318181818181818, 'MntFruits <= 0.143\ngini = 0.444\nsamples = 3\nvalue = [2, 1]\nclass = Low'),
 Text(0.8389854659447136, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.8572242804217726, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.9361641493302936, 0.4772727272727273, 'MntGoldProds <= 3.0\ngini = 0.069\nsamples = 251\nvalue = [9, 242]\nclass = High'),
 Text(0.9068110572812768, 0.4318181818181818, 'Income <= 1.507\ngini = 0.062\nsamples = 248\nvalue = [8, 240]\nclass = High'),
 Text(0.8754630948988316, 0.38636363636363635, 'MntFruits <= 0.86\ngini = 0.05\nsamples = 236\nvalue = [6, 230]\nclass = High'),
 Text(0.8401253918495298, 0.3409090909090909, 'MntCoffee <= 0.533\ngini = 0.035\nsamples = 222\nvalue = [4, 218]\nclass = High'),
 Text(0.8310059846110003, 0.29545454545454547, 'gini = 0.0\nsamples = 105\nvalue = [0, 105]\nclass = High'),
 Text(0.8492447990880593, 0.29545454545454547, 'MntCoffee <= 0.546\ngini = 0.066\nsamples = 117\nvalue = [4, 113]\nclass = High'),
 Text(0.8241664291821031, 0.25, 'MntFishProducts <= 1.748\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.8150470219435737, 0.20454545454545456, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.8332858364206327, 0.20454545454545456, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.8743231689940154, 0.25, 'NumWebVisitsMonth <= 0.905\ngini = 0.051\nsamples = 115\nvalue = [3, 112]\nclass = High'),
 Text(0.8515246508976917, 0.20454545454545456, 'NumWebVisitsMonth <= -0.808\ngini = 0.037\nsamples = 107\nvalue = [2.0, 105.0]\nclass = High'),
 Text(0.8424052436591621, 0.1590909090909091, 'MntCoffee <= 0.726\ngini = 0.087\nsamples = 44\nvalue = [2, 42]\nclass = High'),
 Text(0.8241664291821031, 0.11363636363636363, 'MntMeatProducts <= 0.326\ngini = 0.375\nsamples = 4\nvalue = [1, 3]\nclass = High'),
 Text(0.8150470219435737, 0.06818181818181818, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.8332858364206327, 0.06818181818181818, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.8606440581362211, 0.11363636363636363, 'MntSweetProducts <= 0.059\ngini = 0.049\nsamples = 40\nvalue = [1, 39]\nclass = High'),
 Text(0.8515246508976917, 0.06818181818181818, 'MntGoldProds <= -0.072\ngini = 0.32\nsamples = 5\nvalue = [1, 4]\nclass = High'),
 Text(0.8424052436591621, 0.022727272727272728, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.8606440581362211, 0.022727272727272728, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.8697634653747507, 0.06818181818181818, 'gini = 0.0\nsamples = 35\nvalue = [0, 35]\nclass = High'),
 Text(0.8606440581362211, 0.1590909090909091, 'gini = 0.0\nsamples = 63\nvalue = [0, 63]\nclass = High'),
 Text(0.8971216870903391, 0.20454545454545456, 'MntMeatProducts <= -0.071\ngini = 0.219\nsamples = 8\nvalue = [1, 7]\nclass = High'),
 Text(0.8880022798518097, 0.1590909090909091, 'MntCoffee <= 1.637\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.8788828726132801, 0.11363636363636363, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.8971216870903391, 0.11363636363636363, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.9062410943288686, 0.1590909090909091, 'gini = 0.0\nsamples = 6\nvalue = [0, 6]\nclass = High'),
 Text(0.9108007979481334, 0.3409090909090909, 'MntCoffee <= 0.357\ngini = 0.245\nsamples = 14\nvalue = [2, 12]\nclass = High'),
 Text(0.9016813907096038, 0.29545454545454547, 'MntFishProducts <= 1.949\ngini = 0.444\nsamples = 6\nvalue = [2, 4]\nclass = High'),
 Text(0.8925619834710744, 0.25, 'gini = 0.0\nsamples = 2\nvalue = [2, 0]\nclass = Low'),
 Text(0.9108007979481334, 0.25, 'gini = 0.0\nsamples = 4\nvalue = [0, 4]\nclass = High'),
 Text(0.9199202051866628, 0.29545454545454547, 'gini = 0.0\nsamples = 8\nvalue = [0, 8]\nclass = High'),
 Text(0.9381590196637218, 0.38636363636363635, 'MntFishProducts <= -0.477\ngini = 0.278\nsamples = 12\nvalue = [2, 10]\nclass = High'),
 Text(0.9290396124251924, 0.3409090909090909, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.9472784269022514, 0.3409090909090909, 'Kidhome <= 0.105\ngini = 0.165\nsamples = 11\nvalue = [1, 10]\nclass = High'),
 Text(0.9381590196637218, 0.29545454545454547, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = High'),
 Text(0.9563978341407808, 0.29545454545454547, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.9655172413793104, 0.4318181818181818, 'MntCoffee <= 0.14\ngini = 0.444\nsamples = 3\nvalue = [1, 2]\nclass = High'),
 Text(0.9563978341407808, 0.38636363636363635, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.9746366486178398, 0.38636363636363635, 'gini = 0.0\nsamples = 2\nvalue = [0, 2]\nclass = High'),
 Text(0.9058136221145625, 0.5227272727272727, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.9149330293530921, 0.5681818181818182, 'gini = 0.0\nsamples = 126\nvalue = [0, 126]\nclass = High'),
 Text(0.981761185522941, 0.6136363636363636, 'Income <= 0.691\ngini = 0.291\nsamples = 17\nvalue = [3, 14]\nclass = High'),
 Text(0.9726417782844116, 0.5681818181818182, 'MntSweetProducts <= 3.754\ngini = 0.49\nsamples = 7\nvalue = [3, 4]\nclass = High'),
 Text(0.963522371045882, 0.5227272727272727, 'MntGoldProds <= 1.493\ngini = 0.375\nsamples = 4\nvalue = [3, 1]\nclass = Low'),
 Text(0.9544029638073526, 0.4772727272727273, 'gini = 0.0\nsamples = 3\nvalue = [3, 0]\nclass = Low'),
 Text(0.9726417782844116, 0.4772727272727273, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High'),
 Text(0.981761185522941, 0.5227272727272727, 'gini = 0.0\nsamples = 3\nvalue = [0, 3]\nclass = High'),
 Text(0.9908805927614706, 0.5681818181818182, 'gini = 0.0\nsamples = 10\nvalue = [0, 10]\nclass = High'),
 Text(0.9711456255343403, 0.7045454545454546, 'MntMeatProducts <= 0.166\ngini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Low'),
 Text(0.9620262182958108, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [1, 0]\nclass = Low'),
 Text(0.9802650327728698, 0.6590909090909091, 'gini = 0.0\nsamples = 1\nvalue = [0, 1]\nclass = High')]
No description has been provided for this image

Ensemble Learning Methods¶

In [68]:
class RandForestClassifier:
    def __init__(self, x_train, y_train, x_test, y_test, n_estimators: int = 100, max_depth: int = 8):

        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.set_params()

    def set_params(self):
        self.randf = RandomForestClassifier(criterion='entropy',
                                            max_depth=self.max_depth,
                                            n_estimators=self.n_estimators,
                                            random_state=1)
        self.randf.fit(self.x_train, self.y_train)
        self.randf_predict = self.randf.predict(self.x_test)

    def accuracy_test(self) -> float:
        return metrics.accuracy_score(self.y_test, self.randf_predict)

    def accuracy_train(self) -> float:
        train_predict = self.randf.predict(self.x_train)
        return metrics.accuracy_score(self.y_train, train_predict)

    def confusion_matrix(self):
        matrix_randf = metrics.confusion_matrix(self.y_test, self.randf_predict)
        matrix_disp = metrics.ConfusionMatrixDisplay(matrix_randf)
        matrix_disp.plot(cmap='Blues')
        plt.grid(False)
        plt.title('Random Forest Confusion Matrix')
        plt.show()

    def grid_search(self) -> tuple[float, GridSearchCV]:
        search_params = {
            'n_estimators': range(70, 230, 30),
            'criterion': ['entropy'],
            'max_depth': range(4, 10),
            'random_state': [1]
        }
        grid = GridSearchCV(self.randf, search_params, scoring='accuracy', n_jobs=2)
        grid.fit(self.x_train, self.y_train)
        test_score = grid.score(self.x_test, self.y_test)
        return test_score, grid
In [69]:
randf_model = RandForestClassifier(x_train, y_train, x_test, y_test)

print(f'Random Forest Train Acc: {randf_model.accuracy_train() * 100:.3f}%')
print(f'Random Forest Test Acc: {randf_model.accuracy_test() * 100:.3f}%')
Random Forest Train Acc: 96.556%
Random Forest Test Acc: 92.560%
In [70]:
randf_model.confusion_matrix()
No description has been provided for this image
In [71]:
grid_res = randf_model.grid_search()
print(f'Random Forest\nTest Score: {grid_res[0]}\nParams: {grid_res[1].best_params_}')
Random Forest
Test Score: 0.9241071428571429
Params: {'criterion': 'entropy', 'max_depth': 7, 'n_estimators': 100, 'random_state': 1}

Hyperparameters

In [72]:
def n_estimators_effects():
    n_estimators = range(1, 200, 10)
    train_res = []
    test_res = []
    for est_count in n_estimators:
        randf = RandForestClassifier(x_train, y_train, x_test, y_test, n_estimators=est_count)
        test_res.append(randf.accuracy_test())
        train_res.append(randf.accuracy_train())

    plt.plot(n_estimators, test_res)
    plt.plot(n_estimators, train_res)
    plt.xlabel('n_estimators')
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")


def max_depth_effects():
    max_depths = range(1, 20)
    train_res = []
    test_res = []
    for depth in max_depths:
        randf = RandForestClassifier(x_train, y_train, x_test, y_test, max_depth=depth)
        test_res.append(randf.accuracy_test())
        train_res.append(randf.accuracy_train())

    plt.plot(max_depths, test_res)
    plt.plot(max_depths, train_res)
    plt.xlabel('max_depth')
    plt.ylabel('Accuracy')
    plt.legend(loc="lower right")


plt.figure(figsize=(10, 4))
plt.suptitle('Hyperparameter Effects on Accuracy on Random Forest Classifier')
plt.subplot(1, 2, 1)
n_estimators_effects()
plt.subplot(1, 2, 2)
max_depth_effects()
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1355520063.py:14: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  plt.legend(loc="lower right")
C:\Users\ASUS\AppData\Local\Temp\ipykernel_22524\1355520063.py:30: UserWarning: No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
  plt.legend(loc="lower right")
No description has been provided for this image

Comparative Analysis of Bias and Variance

In [73]:
dtree = DecisionTreeClassifier(** dtree_params)
_, dtree_bias, dtree_var = bias_variance_decomp(
                                dtree,
                                x_train.values, y_train.values,
                                x_test.values, y_test.values,
                                loss='mse', random_seed=10
                           )
In [74]:
_, randf_bias, randf_var = bias_variance_decomp(randf_model.randf,
                                                x_train.values, y_train.values,
                                                x_test.values, y_test.values,
                                                loss='mse', random_seed=1)
In [75]:
display(HTML(f'<b>Decision Tree Bias:</b>     {dtree_bias:.3f}'))
display(HTML(f'<b>Decision Tree Variance:</b> {dtree_var:.3f}'))
display(HTML(f'<b>Random Forest Bias:</b>     {randf_bias:.3f}'))
display(HTML(f'<b>Random Forest Variance:</b> {randf_var:.3f}'))
Decision Tree Bias: 0.069
Decision Tree Variance: 0.024
Random Forest Bias: 0.068
Random Forest Variance: 0.012

The comparison here is between the Decision Tree model and the Random Forest model.

Bias refers to the difference between the average prediction of a model and the actual value we aim to predict. High bias can lead to underfitting, where the model fails to capture the underlying relationships between features and target outputs.

Variance measures the variability of model predictions for a given data point, indicating the spread of the data. High variance can lead to overfitting, where the model learns the noise in the training data instead of the true signal.

Random Forests address these issues by using an ensemble of multiple Decision Trees. This approach helps to

  • reduce variance
  • improve generalization

In contrast, Decision Trees alone tend to overfit the training data and typically exhibit

  • higher variance
  • lower bias
In [76]:
plot_tree(dtree) #This is best dtree with best_params
Out[76]:
[Text(0.41304347826086957, 0.9, 'x[4] <= -0.458\nentropy = 0.999\nsamples = 1568\nvalue = [812.0, 756.0]'),
 Text(0.17391304347826086, 0.7, 'x[2] <= -0.414\nentropy = 0.474\nsamples = 798\nvalue = [717, 81]'),
 Text(0.08695652173913043, 0.5, 'x[7] <= 0.421\nentropy = 0.032\nsamples = 608\nvalue = [606, 2]'),
 Text(0.043478260869565216, 0.3, 'entropy = 0.0\nsamples = 600\nvalue = [600, 0]'),
 Text(0.13043478260869565, 0.3, 'entropy = 0.811\nsamples = 8\nvalue = [6, 2]'),
 Text(0.2608695652173913, 0.5, 'x[2] <= 0.06\nentropy = 0.979\nsamples = 190\nvalue = [111.0, 79.0]'),
 Text(0.21739130434782608, 0.3, 'x[4] <= -0.625\nentropy = 0.803\nsamples = 147\nvalue = [111.0, 36.0]'),
 Text(0.17391304347826086, 0.1, 'entropy = 0.116\nsamples = 64\nvalue = [63, 1]'),
 Text(0.2608695652173913, 0.1, 'entropy = 0.982\nsamples = 83\nvalue = [48, 35]'),
 Text(0.30434782608695654, 0.3, 'entropy = 0.0\nsamples = 43\nvalue = [0, 43]'),
 Text(0.6521739130434783, 0.7, 'x[2] <= -0.362\nentropy = 0.539\nsamples = 770\nvalue = [95, 675]'),
 Text(0.4782608695652174, 0.5, 'x[4] <= -0.368\nentropy = 0.997\nsamples = 103\nvalue = [48, 55]'),
 Text(0.391304347826087, 0.3, 'x[2] <= -0.512\nentropy = 0.746\nsamples = 33\nvalue = [26, 7]'),
 Text(0.34782608695652173, 0.1, 'entropy = 0.439\nsamples = 22\nvalue = [20, 2]'),
 Text(0.43478260869565216, 0.1, 'entropy = 0.994\nsamples = 11\nvalue = [6, 5]'),
 Text(0.5652173913043478, 0.3, 'x[6] <= 0.859\nentropy = 0.898\nsamples = 70\nvalue = [22, 48]'),
 Text(0.5217391304347826, 0.1, 'entropy = 0.958\nsamples = 58\nvalue = [22, 36]'),
 Text(0.6086956521739131, 0.1, 'entropy = 0.0\nsamples = 12\nvalue = [0, 12]'),
 Text(0.8260869565217391, 0.5, 'x[8] <= -1.236\nentropy = 0.368\nsamples = 667\nvalue = [47, 620]'),
 Text(0.7391304347826086, 0.3, 'x[6] <= 0.217\nentropy = 0.577\nsamples = 211\nvalue = [29, 182]'),
 Text(0.6956521739130435, 0.1, 'entropy = 0.908\nsamples = 65\nvalue = [21, 44]'),
 Text(0.782608695652174, 0.1, 'entropy = 0.306\nsamples = 146\nvalue = [8, 138]'),
 Text(0.9130434782608695, 0.3, 'x[4] <= -0.166\nentropy = 0.24\nsamples = 456\nvalue = [18, 438]'),
 Text(0.8695652173913043, 0.1, 'entropy = 0.506\nsamples = 125\nvalue = [14, 111]'),
 Text(0.9565217391304348, 0.1, 'entropy = 0.094\nsamples = 331\nvalue = [4, 327]')]
No description has been provided for this image

Differential privacy¶

Adding noise to a dataset is a technique for data anonymization, which enhances privacy and security.
It makes identifying individuals in the dataset difficult and decreases risks of re-identification and inference attacks. However, it may reduce data quality and utility.
The impact of noise addition varies based on the specific tasks and applications for which the data is used.
If noise addition follows a differential privacy framework, it guarantees privacy protection. Balancing privacy preservation and data utility is crucial.

Common methods fo add noise to data:

  • Pulsed Noise
    • Introduces abrupt changes resembling pulses, often used to simulate sudden disturbances
  • Gaussian Noise
    • Adds smooth, continuous variation following a normal distribution, commonly used in statistical modeling and simulations for a more natural variation.
In [106]:
sensitivity_range  = [x / 10.0 for x in range(1, 30, 1)]

def noise_gen(data, sensitivity, eps=1.0):
    return np.random.laplace(0, scale=sensitivity / eps, size=data.shape)

def add_noise():
    nx_train=[]
    nx_test=[]
    for sensitivity in sensitivity_range:
        nx_train.append(x_train + noise_gen(x_train, sensitivity))
        nx_test.append(x_test + noise_gen(x_test, sensitivity))
    return nx_train, nx_test
In [107]:
noisy_X_trains, noisy_X_tests = add_noise()
In [108]:
def plot_accuracy_vs_sensitivity(class_type, nX_trains, y_train, nX_tests, y_test):
    scores = [0] * len(sensitivity_range)
    for i in range(len(sensitivity_range)):
        DT = Classifier(class_type, nX_trains[i], y_train, nX_tests[i], y_test)
        scores[i] = DT.accuracy_test()
    plt.plot(sensitivity_range, scores)
    plt.xlabel('sensitivity')
    plt.ylabel('Accuracy')
    plt.title('Decision Tree Accuracy with noise')
    plt.show()
In [109]:
plot_accuracy_vs_sensitivity(
    DecisionTreeClassifier, 
    noisy_X_trains, y_train, 
    noisy_X_tests, y_test
)
No description has been provided for this image
In [110]:
plot_accuracy_vs_sensitivity(
    KNeighborsClassifier, 
    noisy_X_trains, y_train, 
    noisy_X_tests, y_test
)
No description has been provided for this image
In [111]:
plot_accuracy_vs_sensitivity(
    LogisticRegression, 
    noisy_X_trains, y_train, 
    noisy_X_tests, y_test
)
No description has been provided for this image
In [112]:
noisy_dtree_model = Classifier(DecisionTreeClassifier, noisy_X_trains[3], y_train, noisy_X_tests[3], y_test)
noisy_knear_model = Classifier(KNeighborsClassifier, noisy_X_trains[3], y_train, noisy_X_tests[3], y_test)
noisy_logreg_model = Classifier(LogisticRegression, noisy_X_trains[3], y_train, noisy_X_tests[3], y_test)
In [113]:
# Print accuracy on the noisy dataset
print(f'Noisy Decision Tree:       {noisy_dtree_model.accuracy_test() * 100:.3f}%')
print(f'Noisy K Nearest Neighbors: {noisy_knear_model.accuracy_test() * 100:.3f}%')
print(f'Noisy Logistic Regression: {noisy_logreg_model.accuracy_test() * 100:.3f}%')
Noisy Decision Tree:       78.423%
Noisy K Nearest Neighbors: 83.929%
Noisy Logistic Regression: 86.607%
In [114]:
# Compare confusion matrices
noisy_dtree_model.confusion_matrix()
noisy_knear_model.confusion_matrix()
noisy_logreg_model.confusion_matrix()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Gradient-boosting¶

Gradient boosting is an ensemble learning technique used for both regression and classification problems.
It builds a series of weak learners, typically decision trees, sequentially.
Each new tree corrects the errors of the combined ensemble of the existing trees.
The predictions of each tree are weighted based on their performance, with more accurate trees receiving higher weights.

Working Process

  • Initial Model:
    Start with a simple model, often a shallow decision tree.
  • Residuals Calculation:
    Calculate the residuals (differences between predictions and actual values) for the current model.
  • Next Model:
    Build a new weak learner (tree) that focuses on minimizing the residuals of the previous model.
  • Weighted Combination:
    Combine the new model with the previous ones, giving more weight to accurate models and less weight to less accurate ones.
  • Iteration:
    Repeat steps 2-4 until a predefined number of models are created or until a specified level of performance is achieved.

Difference Between Boosting Trees and Decision Trees

As the Decision Trees are a simple and interpretable model that can be used for both regression and classification tasks. They work by recursively splitting the data into subsets based on the values of the input variables, until a stopping criterion is met. The resulting tree structure can be visualized and interpreted, making it easy to understand how the model is making predictions.

  • Gradient Boosting is an ensemble method that combines multiple decision trees to make predictions, while Decision Trees are standalone models that make predictions based on a single tree.
  • Gradient Boosting can handle non-linear relationships between the input variables and the output variable, while Decision Trees are limited to linear relationships.
  • Gradient Boosting can also handle missing data and outliers more effectively than Decision Trees, as it is less prone to overfitting.

XGBoost and its functionality¶

Extreme Gradient Boosting (XGBoost) is a popular implementation fo the Gradient Boosting algorithm. It is designed to be highly scalable and efficient.

Its gradient boosting technique minimizes a loss function, which measures how well the model can predict the target variable. One of XGBoost's key features is its ability to handle missing data and outliers. It does this by using regularization, which penalizes complex models and encourages simpler models that are less likely to overfit the data.

Its pruning technique removes branches of the decision tree that do not contribute to the model's overall accuracy. Another essential feature of XGBoost is its ability to handle sparse and dense data. It does this by using a technique called sparsity-aware split finding, which can more efficiently handle missing values and zero values in sparse data than traditional split finding algorithms.

Some features of XGBoost:

  • Support for custom loss functions
  • Early stopping to prevent overfitting
  • The ability to handle multi-class classification problems

This algorithm is used in a wide rage of machine learning tasks, including regression, classification, and ranking.

XGBoost algorithm¶

Important Hyperparameters for XGBoost

Hyperparameter Description
max_depth The maximum depth of each decision tree in the ensemble.
Increasing this value can make the model more complex and potentially more accurate but may also increase the risk of overfitting.
learning_rate The step size is used to update the model weights during each iteration.
A lower learning rate can make the model more conservative and less prone to overfitting but may also require more iterations to converge.
n_estimators The number of decision trees in the ensemble.
Increasing this value can make the model more accurate, but may also increase the risk of overfitting and make the model slower to train.
subsample The fraction of the training data used to train each decision tree.
Setting this value to less than 1.0 can make the model more robust to noise and reduce overfitting.
colsample_bytree The fraction of the features used to train each decision tree.
Setting this value to less than 1.0 can make the model more robust to noise and reduce overfitting.
gamma The minimum reduction in the loss function required to make a split at a node.
Increasing this value can make the model more conservative and less prone to overfitting.
reg_alpha L1 regularization term on weights.
Increasing this value can make the model more conservative and less prone to overfitting.
reg_lambda L2 regularization term on weights.
Increasing this value can make the model more conservative and less prone to overfitting.
In [82]:
xgb = XGBClassifier()

params = {
    'max_depth': range(3,11,2),
    'learning_rate': [0.01, 0.03, 0.09, 0.1],
    'n_estimators': range(20,201,30),
    'gamma': [0, 1, 10, 100]
    # 'max_depth': [3, 5, 7],
    # 'learning_rate': [0.1, 0.01],
    # 'n_estimators': [50, 100, 200],
    # # 'subsample': [0.5, 0.7, 1],
    # # 'colsample_bytree': [0.7, 1.0],
    # 'gamma': [0, 1, 5]
}

xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=params, cv=5, scoring="accuracy")
xgb_grid_search.fit(x_train, y_train)
print(f"Best Accuracy: {xgb_grid_search.best_score_ * 100:2.2f}%")
print(f"Best Parameters: {xgb_grid_search.best_params_}")
print(f"Gradient-boosting Accuracy: {xgb_grid_search.score(x_test, y_test) * 100:2.2f}%")
Best Accuracy: 93.18%
Best Parameters: {'gamma': 1, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 200}
Gradient-boosting Accuracy: 92.56%
In [83]:
y_pred = xgb_grid_search.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['0', '1'], columns=['0', '1'])
sns.heatmap(cm_df, annot=True, cmap='Blues', fmt='g')
plt.title('XGBoost Confusion Matrix')
plt.show()
No description has been provided for this image
In [ ]: